diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index c8ce3aab3f303..e03ce8c06fed5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -67,7 +67,7 @@ static cl::opt static cl::opt GCNTrackers( "amdgpu-use-amdgpu-trackers", cl::Hidden, cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), - cl::init(false)); + cl::init(true)); static cl::opt PendingQueueLimit( "amdgpu-scheduler-pending-queue-limit", cl::Hidden, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll index b67080bd4798d..cb4db0bac4730 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll @@ -356,62 +356,62 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_ushort v17, v[6:7] ; GFX8-NEXT: flat_load_ushort v18, v[8:9] -; GFX8-NEXT: flat_load_ushort v19, v[10:11] -; GFX8-NEXT: flat_load_ushort v20, v[12:13] -; GFX8-NEXT: flat_load_ushort v21, v[14:15] -; GFX8-NEXT: flat_load_ushort v22, v[0:1] +; GFX8-NEXT: flat_load_ushort v10, v[10:11] +; GFX8-NEXT: flat_load_ushort v11, v[12:13] +; GFX8-NEXT: flat_load_ushort v12, v[14:15] +; GFX8-NEXT: flat_load_ushort v13, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2 +; GFX8-NEXT: flat_load_ushort v14, v[2:3] +; GFX8-NEXT: flat_load_ushort v15, v[0:1] +; GFX8-NEXT: flat_load_ushort v19, v[6:7] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 8, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 10, v2 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v12, vcc, 10, v2 -; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 12, v2 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: flat_load_ushort v3, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 12, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v20, v[0:1] ; GFX8-NEXT: flat_load_ushort v6, v[6:7] ; GFX8-NEXT: flat_load_ushort v7, v[8:9] -; GFX8-NEXT: flat_load_ushort v8, v[10:11] -; GFX8-NEXT: flat_load_ushort v9, v[12:13] -; GFX8-NEXT: flat_load_ushort v10, v[14:15] +; GFX8-NEXT: flat_load_ushort v2, v[2:3] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u16_e32 v2, v16, v2 +; GFX8-NEXT: v_add_u16_e32 v3, v16, v14 ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u16_e32 v3, v17, v3 -; GFX8-NEXT: flat_store_short v[4:5], v2 -; GFX8-NEXT: flat_store_short v[0:1], v3 +; GFX8-NEXT: v_add_u16_e32 v8, v17, v15 +; GFX8-NEXT: flat_store_short v[4:5], v3 +; GFX8-NEXT: flat_store_short v[0:1], v8 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u16_e32 v6, v18, v6 +; GFX8-NEXT: v_add_u16_e32 v9, v18, v19 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX8-NEXT: flat_store_short v[0:1], v6 +; GFX8-NEXT: flat_store_short v[0:1], v9 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v4 -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u16_e32 v7, v19, v7 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX8-NEXT: flat_store_short v[0:1], v7 +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u16_e32 v10, v10, v20 +; GFX8-NEXT: flat_store_short v[0:1], v10 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v4 ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u16_e32 v8, v20, v8 +; GFX8-NEXT: v_add_u16_e32 v6, v11, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX8-NEXT: flat_store_short v[0:1], v8 +; GFX8-NEXT: flat_store_short v[0:1], v6 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v4 ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u16_e32 v9, v21, v9 +; GFX8-NEXT: v_add_u16_e32 v7, v12, v7 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX8-NEXT: flat_store_short v[0:1], v9 +; GFX8-NEXT: flat_store_short v[0:1], v7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v4 ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u16_e32 v10, v22, v10 +; GFX8-NEXT: v_add_u16_e32 v2, v13, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; GFX8-NEXT: flat_store_short v[0:1], v10 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -513,29 +513,29 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v14, v[0:1] +; GFX8-NEXT: flat_load_ushort v16, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_add_u16_e32 v1, v6, v10 ; GFX8-NEXT: v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e32 v3, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v10, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v11, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v6, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v7, v8, v12 ; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 +; GFX8-NEXT: v_add_u16_e32 v10, v9, v13 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v13, v14, v0 +; GFX8-NEXT: v_add_u16_e32 v11, v16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v10 -; GFX8-NEXT: v_or_b32_e32 v2, v11, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v1, v3, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v10, v9 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: flat_store_short v[6:7], v13 +; GFX8-NEXT: flat_store_short v[14:15], v11 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -661,55 +661,55 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX8-LABEL: add_v11i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v0 +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 18, v2 -; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_ushort v14, v[14:15] -; GFX8-NEXT: flat_load_ushort v15, v[16:17] -; GFX8-NEXT: flat_load_ushort v16, v[2:3] -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u16_e32 v17, v6, v10 -; GFX8-NEXT: v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 18, v0 -; GFX8-NEXT: v_add_u16_e32 v18, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v0 -; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: flat_load_ushort v3, v[6:7] ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v21, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v4 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; GFX8-NEXT: v_add_u16_e32 v19, v8, v12 +; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] +; GFX8-NEXT: flat_load_ushort v18, v[14:15] +; GFX8-NEXT: flat_load_ushort v16, v[16:17] +; GFX8-NEXT: flat_load_ushort v17, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 18, v2 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v19, v[0:1] +; GFX8-NEXT: flat_load_ushort v20, v[14:15] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 20, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 16, v4 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u16_e32 v1, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 18, v4 +; GFX8-NEXT: v_add_u16_e32 v3, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v10, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v11, v8, v12 ; GFX8-NEXT: v_add_u16_sdwa v12, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 18, v4 -; GFX8-NEXT: v_add_u16_e32 v20, v9, v13 +; GFX8-NEXT: v_add_u16_e32 v21, v9, v13 ; GFX8-NEXT: v_add_u16_sdwa v13, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v17, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v18, v11 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 20, v4 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 20, v4 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u16_e32 v14, v2, v14 +; GFX8-NEXT: v_add_u16_e32 v18, v18, v19 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v15, v3, v15 -; GFX8-NEXT: v_or_b32_e32 v2, v19, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v20, v13 +; GFX8-NEXT: v_add_u16_e32 v16, v16, v20 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v16, v21, v16 +; GFX8-NEXT: v_add_u16_e32 v17, v17, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v10 +; GFX8-NEXT: v_or_b32_e32 v2, v11, v12 +; GFX8-NEXT: v_or_b32_e32 v3, v21, v13 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: flat_store_short v[6:7], v14 -; GFX8-NEXT: flat_store_short v[8:9], v15 -; GFX8-NEXT: flat_store_short v[10:11], v16 +; GFX8-NEXT: flat_store_short v[14:15], v18 +; GFX8-NEXT: flat_store_short v[6:7], v16 +; GFX8-NEXT: flat_store_short v[8:9], v17 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -794,34 +794,34 @@ define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[2:3] -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v2, v6, v10 -; GFX8-NEXT: v_add_u16_sdwa v3, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v10, v7, v11 -; GFX8-NEXT: v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX8-NEXT: v_add_u16_e32 v16, v8, v12 -; GFX8-NEXT: v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v12, v9, v13 +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u16_e32 v0, v6, v10 +; GFX8-NEXT: v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v2, v7, v11 +; GFX8-NEXT: v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v6, v8, v12 +; GFX8-NEXT: v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v8, v9, v13 ; GFX8-NEXT: v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v10, v11 -; GFX8-NEXT: v_or_b32_e32 v2, v16, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v12, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v8, v9 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u16_e32 v6, v14, v16 +; GFX8-NEXT: v_add_u16_sdwa v7, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v8, v15, v17 +; GFX8-NEXT: v_add_u16_sdwa v9, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u16_e32 v8, v6, v14 -; GFX8-NEXT: v_add_u16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v9, v7, v15 -; GFX8-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v4 -; GFX8-NEXT: v_or_b32_e32 v6, v8, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v9, v7 +; GFX8-NEXT: v_or_b32_e32 v7, v8, v9 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7] ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll index 6ea0a9446ff9d..fc42801fd3642 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll @@ -711,8 +711,8 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou ; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25] -; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 ; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) @@ -737,8 +737,8 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou ; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25] -; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 ; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) @@ -882,8 +882,8 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x ; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25] -; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 ; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) @@ -908,8 +908,8 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x ; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) ; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25] -; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:32 ; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll index ea149cc2f4a9e..f6396be103ae5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll @@ -717,9 +717,9 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) { ; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] @@ -727,9 +727,9 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) { ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 ; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] ; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] +; GFX6-NEXT: v_fma_f64 v[20:21], -v[8:9], v[16:17], v[18:19] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[20:21], v[10:11], v[16:17] ; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] ; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] @@ -950,9 +950,9 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] @@ -960,9 +960,9 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 ; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] ; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] +; GFX6-NEXT: v_fma_f64 v[20:21], -v[8:9], v[16:17], v[18:19] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[20:21], v[10:11], v[16:17] ; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] ; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] @@ -1106,7 +1106,7 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) { ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 +; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 @@ -1115,23 +1115,23 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) { ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v20 ; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 +; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v20 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1266,7 +1266,7 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) { ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 +; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 @@ -1275,23 +1275,23 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) { ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v20 ; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 +; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v20 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,7 +1493,7 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) { ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 +; GFX6-NEXT: v_mov_b32_e32 v20, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 @@ -1502,23 +1502,23 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) { ; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v18 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v9, v20 ; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[18:19], -v[4:5], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[8:9], v[14:15] ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 +; GFX6-NEXT: v_mul_f64 v[8:9], v[16:17], v[4:5] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[18:19], v[6:7], v[12:13] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[16:17] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v17, v20 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 ; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9] ; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1730,9 +1730,9 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] @@ -1740,9 +1740,9 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) { ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 ; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] ; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] +; GFX6-NEXT: v_fma_f64 v[20:21], -v[8:9], v[16:17], v[18:19] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[20:21], v[10:11], v[16:17] ; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] ; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] ; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 715a7778351cc..28581f2464f01 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -7377,270 +7377,270 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16 -; GFX6-NEXT: v_not_b32_e32 v18, 63 -; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 64, v19 -; GFX6-NEXT: v_add_i32_e32 v27, vcc, v19, v18 -; GFX6-NEXT: v_lshr_b64 v[23:24], v[0:1], v23 -; GFX6-NEXT: v_lshl_b64 v[25:26], v[2:3], v19 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v19 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v27 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v19 +; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 +; GFX6-NEXT: v_not_b32_e32 v23, 63 +; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v19, v23 +; GFX6-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], v17 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v17, v17, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v18, v18, v22, vcc ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX6-NEXT: v_or_b32_e32 v19, v23, v25 -; GFX6-NEXT: v_or_b32_e32 v23, v24, v26 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] -; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1 -; GFX6-NEXT: v_mov_b32_e32 v17, 0x7f -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 -; GFX6-NEXT: v_bfi_b32 v10, v16, 0, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v24, 0, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v22, vcc -; GFX6-NEXT: v_add_i32_e32 v16, vcc, v10, v18 -; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 64, v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v10 -; GFX6-NEXT: v_lshr_b64 v[10:11], v[0:1], v10 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v21 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v16 -; GFX6-NEXT: v_or_b32_e32 v10, v10, v21 -; GFX6-NEXT: v_or_b32_e32 v11, v11, v22 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v19 +; GFX6-NEXT: v_cndmask_b32_e64 v19, v17, v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v18, v18, v3, s[4:5] +; GFX6-NEXT: v_lshr_b64 v[2:3], v[8:9], 1 +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 31, v10 +; GFX6-NEXT: v_mov_b32_e32 v21, 0x7f +; GFX6-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[10:11], 1 +; GFX6-NEXT: v_bfi_b32 v22, v16, 0, v21 +; GFX6-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v22 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[8:9], v10 +; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v22 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v22 +; GFX6-NEXT: v_or_b32_e32 v16, v16, v10 +; GFX6-NEXT: v_add_i32_e64 v10, s[4:5], v22, v23 +; GFX6-NEXT: v_or_b32_e32 v17, v17, v11 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[8:9], v10 +; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22 +; GFX6-NEXT: v_cndmask_b32_e64 v10, v10, v16, s[4:5] +; GFX6-NEXT: v_lshr_b64 v[8:9], v[8:9], v22 +; GFX6-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] +; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v16 -; GFX6-NEXT: v_lshr_b64 v[10:11], v[4:5], v10 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[6:7], v16 -; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX6-NEXT: v_add_i32_e32 v19, vcc, v16, v18 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[4:5] +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v16 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[4:5] +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v16 -; GFX6-NEXT: v_or_b32_e32 v16, v10, v21 -; GFX6-NEXT: v_or_b32_e32 v21, v11, v22 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[4:5], v19 -; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v8, v10, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v11, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v16, v23 +; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v8 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v16 +; GFX6-NEXT: v_cndmask_b32_e64 v16, v8, v6, s[4:5] ; GFX6-NEXT: v_lshr_b64 v[8:9], v[12:13], 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v10, 31, v14 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX6-NEXT: v_or_b32_e32 v3, v18, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX6-NEXT: v_bfi_b32 v18, v20, 0, v21 ; GFX6-NEXT: v_lshr_b64 v[10:11], v[14:15], 1 -; GFX6-NEXT: v_bfi_b32 v14, v20, 0, v17 -; GFX6-NEXT: v_add_i32_e32 v18, vcc, v14, v18 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v14 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; GFX6-NEXT: v_lshr_b64 v[12:13], v[10:11], v14 -; GFX6-NEXT: v_lshr_b64 v[14:15], v[8:9], v14 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 -; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], v18 -; GFX6-NEXT: v_or_b32_e32 v14, v14, v16 -; GFX6-NEXT: v_or_b32_e32 v15, v15, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v24, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v25, v1 -; GFX6-NEXT: v_or_b32_e32 v3, v23, v3 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v6 +; GFX6-NEXT: v_sub_i32_e64 v6, s[6:7], 64, v18 +; GFX6-NEXT: v_lshl_b64 v[12:13], v[10:11], v6 +; GFX6-NEXT: v_lshr_b64 v[14:15], v[8:9], v18 +; GFX6-NEXT: v_cndmask_b32_e64 v17, v17, v7, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v6, s[4:5], v18, v23 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[10:11], v6 +; GFX6-NEXT: v_or_b32_e32 v12, v14, v12 +; GFX6-NEXT: v_or_b32_e32 v13, v15, v13 +; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18 +; GFX6-NEXT: v_cndmask_b32_e64 v12, v6, v12, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v13, v7, v13, s[4:5] +; GFX6-NEXT: v_lshr_b64 v[6:7], v[10:11], v18 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 +; GFX6-NEXT: v_cndmask_b32_e64 v8, v12, v8, s[6:7] +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v9, v13, v9, s[6:7] +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, v7, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX6-NEXT: v_or_b32_e32 v6, v6, v10 -; GFX6-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX6-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX6-NEXT: v_or_b32_e32 v7, v17, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16 -; GFX8-NEXT: v_not_b32_e32 v18, 63 -; GFX8-NEXT: v_sub_u32_e32 v23, vcc, 64, v19 -; GFX8-NEXT: v_add_u32_e32 v27, vcc, v19, v18 -; GFX8-NEXT: v_lshrrev_b64 v[23:24], v23, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[25:26], v19, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v27, v[0:1] +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v19 +; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX8-NEXT: v_not_b32_e32 v23, 63 +; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v19, v23 +; GFX8-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX8-NEXT: v_lshlrev_b64 v[17:18], v17, v[0:1] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v22, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX8-NEXT: v_or_b32_e32 v19, v23, v25 -; GFX8-NEXT: v_or_b32_e32 v23, v24, v26 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v23, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v17, 0x7f -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] -; GFX8-NEXT: v_bfi_b32 v10, v16, 0, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v24, 0, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v22, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v10, v18 -; GFX8-NEXT: v_sub_u32_e32 v21, vcc, 64, v10 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v10, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v21, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v16, v[2:3] -; GFX8-NEXT: v_or_b32_e32 v10, v10, v21 -; GFX8-NEXT: v_or_b32_e32 v11, v11, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v19, v17, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v3, s[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[8:9] +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 31, v10 +; GFX8-NEXT: v_mov_b32_e32 v21, 0x7f +; GFX8-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[10:11] +; GFX8-NEXT: v_bfi_b32 v22, v16, 0, v21 +; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], 64, v22 +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[8:9] +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v22, v[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v22 +; GFX8-NEXT: v_or_b32_e32 v16, v16, v10 +; GFX8-NEXT: v_add_u32_e64 v10, s[4:5], v22, v23 +; GFX8-NEXT: v_or_b32_e32 v17, v17, v11 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[8:9] +; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v16, s[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v22, v[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v16 -; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v16, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, v16, v18 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[4:5] +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v16 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; GFX8-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] -; GFX8-NEXT: v_or_b32_e32 v16, v10, v21 -; GFX8-NEXT: v_or_b32_e32 v21, v11, v22 -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v19, v[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v16, v23 +; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v16, v8, v6, s[4:5] ; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 31, v14 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX8-NEXT: v_or_b32_e32 v3, v18, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX8-NEXT: v_bfi_b32 v18, v20, 0, v21 ; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[14:15] -; GFX8-NEXT: v_bfi_b32 v14, v20, 0, v17 -; GFX8-NEXT: v_add_u32_e32 v18, vcc, v14, v18 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v14 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[14:15], v14, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[10:11], v18, v[10:11] -; GFX8-NEXT: v_or_b32_e32 v14, v14, v16 -; GFX8-NEXT: v_or_b32_e32 v15, v15, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v24, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v25, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v23, v3 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v6 +; GFX8-NEXT: v_sub_u32_e64 v6, s[6:7], 64, v18 +; GFX8-NEXT: v_lshlrev_b64 v[12:13], v6, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[14:15], v18, v[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v17, v7, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], v18, v23 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v6, v[10:11] +; GFX8-NEXT: v_or_b32_e32 v12, v14, v12 +; GFX8-NEXT: v_or_b32_e32 v13, v15, v13 +; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v6, v12, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v7, v13, s[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v18, v[10:11] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v12, v8, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, v13, v9, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, v7, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v10 -; GFX8-NEXT: v_or_b32_e32 v7, v7, v11 +; GFX8-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v17, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 -; GFX9-NEXT: v_mov_b32_e32 v24, 0x7f +; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX9-NEXT: v_sub_u32_e32 v17, 64, v19 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX9-NEXT: v_lshl_or_b32 v9, v10, 31, v9 -; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_bfi_b32 v25, v16, 0, v24 -; GFX9-NEXT: v_sub_u32_e32 v16, 64, v25 +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v19 ; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[18:19], v25, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX9-NEXT: v_or_b32_e32 v18, v18, v16 -; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v23 -; GFX9-NEXT: v_or_b32_e32 v19, v19, v17 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v25 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v26, v2 -; GFX9-NEXT: v_or_b32_e32 v2, v17, v8 -; GFX9-NEXT: v_and_b32_e32 v17, 0x7f, v20 -; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX9-NEXT: v_sub_u32_e32 v3, 64, v17 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v3, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v3, v16, v19 -; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v17 +; GFX9-NEXT: v_lshlrev_b64 v[17:18], v17, v[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_mov_b32_e32 v23, 0x7f +; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v21, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX9-NEXT: v_lshl_or_b32 v9, v10, 31, v9 +; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_bfi_b32 v24, v16, 0, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v18, v22, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v22, v17, v2, s[4:5] +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v24 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v2, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v24 +; GFX9-NEXT: v_cndmask_b32_e64 v21, v21, v3, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, v[10:11] +; GFX9-NEXT: v_or_b32_e32 v16, v18, v16 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v16, v19, v17 +; GFX9-NEXT: v_lshrrev_b64 v[10:11], v24, v[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v16, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v24 +; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v11, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v16 ; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], v17, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v4, v6, vcc -; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v7, vcc -; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX9-NEXT: v_bfi_b32 v13, v20, 0, v24 -; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v13 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX9-NEXT: v_add_u32_e32 v14, 0xffffffc0, v13 -; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v13, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 -; GFX9-NEXT: v_or_b32_e32 v5, v18, v5 -; GFX9-NEXT: v_or_b32_e32 v6, v17, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v12, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v8, v6, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX9-NEXT: v_bfi_b32 v18, v20, 0, v23 +; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[14:15] +; GFX9-NEXT: v_lshl_or_b32 v9, v14, 31, v9 +; GFX9-NEXT: v_sub_u32_e32 v6, 64, v18 +; GFX9-NEXT: v_lshlrev_b64 v[12:13], v6, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], v18, v[8:9] +; GFX9-NEXT: v_add_u32_e32 v6, 0xffffffc0, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v17, v17, v7, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v6, v[10:11] +; GFX9-NEXT: v_or_b32_e32 v12, v14, v12 +; GFX9-NEXT: v_or_b32_e32 v13, v15, v13 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v6, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v7, v13, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v18, v[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v8, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v13, v9, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, v7, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v2, v22, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v21, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX9-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX9-NEXT: v_or_b32_e32 v7, v17, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v2i128: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 5aa5a6716c152..1fe36252430d4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -7139,272 +7139,272 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_mov_b32_e32 v18, 0x7f -; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX6-NEXT: v_bfi_b32 v19, v16, 0, v18 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX6-NEXT: v_not_b32_e32 v17, 63 -; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 64, v19 -; GFX6-NEXT: v_add_i32_e32 v27, vcc, v19, v17 -; GFX6-NEXT: v_lshr_b64 v[23:24], v[21:22], v23 -; GFX6-NEXT: v_lshl_b64 v[25:26], v[2:3], v19 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[21:22], v19 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[21:22], v27 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX6-NEXT: v_or_b32_e32 v19, v23, v25 -; GFX6-NEXT: v_or_b32_e32 v23, v24, v26 -; GFX6-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v21, v19, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] -; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v16 -; GFX6-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc -; GFX6-NEXT: v_add_i32_e32 v16, vcc, v2, v17 -; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 64, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v2 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[8:9], v2 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[10:11], v21 +; GFX6-NEXT: v_mov_b32_e32 v19, 0x7f +; GFX6-NEXT: v_lshrrev_b32_e32 v17, 31, v1 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: v_bfi_b32 v23, v16, 0, v19 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 +; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 +; GFX6-NEXT: v_not_b32_e32 v24, 63 +; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v23, v24 +; GFX6-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], v17 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX6-NEXT: v_cndmask_b32_e32 v17, v17, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v18, v18, v22, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX6-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v21, v17, v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v18, v18, v3, s[4:5] +; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], 64, v22 +; GFX6-NEXT: v_lshl_b64 v[2:3], v[10:11], v2 +; GFX6-NEXT: v_lshr_b64 v[16:17], v[8:9], v22 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v23 +; GFX6-NEXT: v_or_b32_e32 v16, v16, v2 +; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v22, v24 +; GFX6-NEXT: v_or_b32_e32 v17, v17, v3 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v2 +; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22 +; GFX6-NEXT: v_cndmask_b32_e64 v16, v2, v16, s[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v22 +; GFX6-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[6:7] +; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; GFX6-NEXT: v_or_b32_e32 v21, v2, v21 -; GFX6-NEXT: v_or_b32_e32 v22, v3, v22 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v16 -; GFX6-NEXT: v_bfi_b32 v16, v20, 0, v18 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v22, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v24, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v25, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v19, v8 -; GFX6-NEXT: v_or_b32_e32 v3, v23, v9 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v16 -; GFX6-NEXT: v_add_i32_e32 v21, vcc, v16, v17 -; GFX6-NEXT: v_lshr_b64 v[10:11], v[8:9], v10 -; GFX6-NEXT: v_lshl_b64 v[18:19], v[6:7], v16 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v16 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v21 +; GFX6-NEXT: v_cndmask_b32_e64 v17, v3, v17, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 31, v5 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; GFX6-NEXT: v_bfi_b32 v16, v20, 0, v19 +; GFX6-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[6:7] +; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX6-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v16 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v22 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16 +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v16, v24 +; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v8 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX6-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v3, v18, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc -; GFX6-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX6-NEXT: v_cndmask_b32_e64 v10, v4, v6, s[4:5] -; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v20 -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc -; GFX6-NEXT: v_add_i32_e32 v17, vcc, v6, v17 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v11, v5, v7, s[4:5] -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v6 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[12:13], v6 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[14:15], v8 -; GFX6-NEXT: v_or_b32_e32 v8, v6, v8 -; GFX6-NEXT: v_or_b32_e32 v9, v7, v9 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v17 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX6-NEXT: v_or_b32_e32 v4, v16, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v18, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v10, v8 -; GFX6-NEXT: v_or_b32_e32 v7, v11, v9 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v16 +; GFX6-NEXT: v_cndmask_b32_e64 v16, v8, v6, s[4:5] +; GFX6-NEXT: v_sub_i32_e64 v6, s[6:7], 64, v18 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[14:15], v6 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[12:13], v18 +; GFX6-NEXT: v_cndmask_b32_e64 v17, v17, v7, s[4:5] +; GFX6-NEXT: v_add_i32_e64 v6, s[4:5], v18, v24 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v6 +; GFX6-NEXT: v_or_b32_e32 v8, v10, v8 +; GFX6-NEXT: v_or_b32_e32 v9, v11, v9 +; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18 +; GFX6-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5] +; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v18 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 +; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[6:7] +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[6:7] +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, v7, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX6-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX6-NEXT: v_or_b32_e32 v7, v17, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v18, 0x7f -; GFX8-NEXT: v_lshlrev_b64 v[21:22], 1, v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX8-NEXT: v_bfi_b32 v19, v16, 0, v18 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: v_not_b32_e32 v17, 63 -; GFX8-NEXT: v_sub_u32_e32 v23, vcc, 64, v19 -; GFX8-NEXT: v_add_u32_e32 v27, vcc, v19, v17 -; GFX8-NEXT: v_lshrrev_b64 v[23:24], v23, v[21:22] -; GFX8-NEXT: v_lshlrev_b64 v[25:26], v19, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v19, v[21:22] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v27, v[21:22] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 -; GFX8-NEXT: v_or_b32_e32 v19, v23, v25 -; GFX8-NEXT: v_or_b32_e32 v23, v24, v26 -; GFX8-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v21, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v19, v0, v2, s[4:5] -; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, v2, v17 -; GFX8-NEXT: v_sub_u32_e32 v21, vcc, 64, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v23, v1, v3, s[4:5] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v2, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v21, v[10:11] +; GFX8-NEXT: v_mov_b32_e32 v19, 0x7f +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 31, v1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: v_bfi_b32 v23, v16, 0, v19 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 +; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX8-NEXT: v_not_b32_e32 v24, 63 +; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v23, v24 +; GFX8-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX8-NEXT: v_lshlrev_b64 v[17:18], v17, v[0:1] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v22, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX8-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v21, v17, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v3, s[4:5] +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], 64, v22 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v22, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] +; GFX8-NEXT: v_or_b32_e32 v16, v16, v2 +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v22, v24 +; GFX8-NEXT: v_or_b32_e32 v17, v17, v3 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, v[10:11] +; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v16, v2, v16, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v21, v2, v21 -; GFX8-NEXT: v_or_b32_e32 v22, v3, v22 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v16, v[10:11] -; GFX8-NEXT: v_bfi_b32 v16, v20, 0, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v24, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v25, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v19, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v23, v9 -; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v16 -; GFX8-NEXT: v_add_u32_e32 v21, vcc, v16, v17 -; GFX8-NEXT: v_lshrrev_b64 v[10:11], v10, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[18:19], v16, v[6:7] -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], v21, v[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v3, v17, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 31, v5 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] +; GFX8-NEXT: v_bfi_b32 v16, v20, 0, v19 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v16 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v22, v[10:11] +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v16, v24 +; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX8-NEXT: v_or_b32_e32 v10, v10, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v3, v18, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc -; GFX8-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v4, v6, s[4:5] -; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, v6, v17 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v5, v7, s[4:5] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v6, v[14:15] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v6, v[12:13] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[14:15] -; GFX8-NEXT: v_or_b32_e32 v8, v6, v8 -; GFX8-NEXT: v_or_b32_e32 v9, v7, v9 -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v17, v[14:15] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v16, v6 -; GFX8-NEXT: v_or_b32_e32 v5, v18, v7 -; GFX8-NEXT: v_or_b32_e32 v6, v10, v8 -; GFX8-NEXT: v_or_b32_e32 v7, v11, v9 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v16, v8, v6, s[4:5] +; GFX8-NEXT: v_sub_u32_e64 v6, s[6:7], 64, v18 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v6, v[14:15] +; GFX8-NEXT: v_lshrrev_b64 v[10:11], v18, v[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v17, v7, s[4:5] +; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], v18, v24 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v6, v[14:15] +; GFX8-NEXT: v_or_b32_e32 v8, v10, v8 +; GFX8-NEXT: v_or_b32_e32 v9, v11, v9 +; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v18, v[14:15] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, v7, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX8-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v17, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v19, 0x7f -; GFX9-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX9-NEXT: v_bfi_b32 v23, v16, 0, v19 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v23 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v26, 0x7f, v16 -; GFX9-NEXT: v_or_b32_e32 v24, v0, v21 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v26 -; GFX9-NEXT: v_or_b32_e32 v25, v1, v22 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], v26, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX9-NEXT: v_or_b32_e32 v21, v21, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v23 -; GFX9-NEXT: v_or_b32_e32 v22, v22, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[17:18] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v24, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v25, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[4:5] -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffc0, v26 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v23, v[17:18] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[4:5] -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v1, v22, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v26, v[10:11] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, v21, v9, vcc +; GFX9-NEXT: v_mov_b32_e32 v23, 0x7f +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 31, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_bfi_b32 v19, v16, 0, v23 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX9-NEXT: v_sub_u32_e32 v17, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v19 +; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX9-NEXT: v_lshlrev_b64 v[17:18], v17, v[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v21, vcc +; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v18, v22, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v22, v17, v2, s[4:5] +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v24 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v2, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], v24, v[8:9] +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v24 +; GFX9-NEXT: v_cndmask_b32_e64 v21, v21, v3, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, v[10:11] +; GFX9-NEXT: v_or_b32_e32 v16, v18, v16 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v16, v19, v17 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v24 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v0, v16, v8 -; GFX9-NEXT: v_or_b32_e32 v1, v17, v9 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX9-NEXT: v_bfi_b32 v16, v20, 0, v19 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, 64, v16 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v11 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[10:11], v24, v[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v16, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v5 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] +; GFX9-NEXT: v_bfi_b32 v16, v20, 0, v23 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v11, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] -; GFX9-NEXT: v_add_u32_e32 v17, 0xffffffc0, v16 -; GFX9-NEXT: v_or_b32_e32 v10, v4, v10 -; GFX9-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v16, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[8:9], v17, v[8:9] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v6, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, 64, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v7, vcc -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, v[12:13] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10 -; GFX9-NEXT: v_or_b32_e32 v16, v4, v6 -; GFX9-NEXT: v_or_b32_e32 v19, v5, v7 -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v11, v[14:15] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, v[14:15] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v19, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc -; GFX9-NEXT: v_or_b32_e32 v4, v17, v6 -; GFX9-NEXT: v_or_b32_e32 v5, v18, v7 -; GFX9-NEXT: v_or_b32_e32 v6, v8, v10 -; GFX9-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v16 +; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v16, v8, v6, s[4:5] +; GFX9-NEXT: v_sub_u32_e32 v6, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v9, v11, vcc +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v6, v[14:15] +; GFX9-NEXT: v_lshrrev_b64 v[10:11], v18, v[12:13] +; GFX9-NEXT: v_add_u32_e32 v6, 0xffffffc0, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v17, v17, v7, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v6, v[14:15] +; GFX9-NEXT: v_or_b32_e32 v8, v10, v8 +; GFX9-NEXT: v_or_b32_e32 v9, v11, v9 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v18, v[14:15] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, v7, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v2, v22, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v21, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX9-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX9-NEXT: v_or_b32_e32 v7, v17, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_v2i128: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index c2129c20e4543..fff1244cf1d98 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -7,33 +7,25 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr addrspace(1) %ptr, i32 %val, i32 %idx) #0 { ; GCN-LABEL: v_insert_v64i32_varidx: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[20:23], s[8:9], 0x0 -; GCN-NEXT: s_load_dwordx2 s[24:25], s[8:9], 0x10 +; GCN-NEXT: s_load_dwordx4 s[28:31], s[8:9], 0x0 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x10 ; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v64, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0 -; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40 -; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0x80 +; GCN-NEXT: s_load_dwordx16 s[12:27], s[30:31], 0x0 +; GCN-NEXT: s_load_dwordx16 s[52:67], s[30:31], 0x40 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[30:31], 0x80 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NEXT: v_mov_b32_e32 v3, s39 -; GCN-NEXT: v_mov_b32_e32 v4, s40 -; GCN-NEXT: v_mov_b32_e32 v5, s41 -; GCN-NEXT: v_mov_b32_e32 v6, s42 -; GCN-NEXT: v_mov_b32_e32 v7, s43 -; GCN-NEXT: v_mov_b32_e32 v8, s44 -; GCN-NEXT: v_mov_b32_e32 v9, s45 -; GCN-NEXT: v_mov_b32_e32 v10, s46 -; GCN-NEXT: v_mov_b32_e32 v11, s47 -; GCN-NEXT: v_mov_b32_e32 v12, s48 -; GCN-NEXT: v_mov_b32_e32 v13, s49 -; GCN-NEXT: v_mov_b32_e32 v14, s50 -; GCN-NEXT: v_mov_b32_e32 v15, s51 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 +; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NEXT: v_mov_b32_e32 v4, s16 +; GCN-NEXT: v_mov_b32_e32 v5, s17 +; GCN-NEXT: v_mov_b32_e32 v6, s18 +; GCN-NEXT: v_mov_b32_e32 v7, s19 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 @@ -42,14 +34,22 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:20 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:24 ; GCN-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:28 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:36 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:40 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:44 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:48 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:52 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60 +; GCN-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:32 +; GCN-NEXT: v_mov_b32_e32 v0, s21 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:36 +; GCN-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40 +; GCN-NEXT: v_mov_b32_e32 v0, s23 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:44 +; GCN-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:48 +; GCN-NEXT: v_mov_b32_e32 v0, s25 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:52 +; GCN-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:56 +; GCN-NEXT: v_mov_b32_e32 v0, s27 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:60 ; GCN-NEXT: v_mov_b32_e32 v0, s52 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64 ; GCN-NEXT: v_mov_b32_e32 v0, s53 @@ -82,74 +82,74 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120 ; GCN-NEXT: v_mov_b32_e32 v0, s67 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124 -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 -; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s37 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132 -; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s38 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:136 -; GCN-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s39 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:140 -; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s40 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:144 -; GCN-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s41 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:148 -; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s42 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:152 -; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mov_b32_e32 v0, s43 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:156 -; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v0, s44 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:160 -; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_mov_b32_e32 v0, s45 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:164 -; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s46 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:168 -; GCN-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NEXT: v_mov_b32_e32 v0, s47 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:172 -; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v0, s48 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:176 -; GCN-NEXT: v_mov_b32_e32 v0, s17 +; GCN-NEXT: v_mov_b32_e32 v0, s49 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:180 -; GCN-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NEXT: v_mov_b32_e32 v0, s50 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:184 -; GCN-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NEXT: v_mov_b32_e32 v0, s51 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:188 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:192 -; GCN-NEXT: v_mov_b32_e32 v0, s37 +; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:196 -; GCN-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:200 -; GCN-NEXT: v_mov_b32_e32 v0, s39 +; GCN-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:204 -; GCN-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:208 -; GCN-NEXT: v_mov_b32_e32 v0, s41 +; GCN-NEXT: v_mov_b32_e32 v0, s9 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:212 -; GCN-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:216 -; GCN-NEXT: v_mov_b32_e32 v0, s43 +; GCN-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:220 -; GCN-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:224 -; GCN-NEXT: v_mov_b32_e32 v0, s45 +; GCN-NEXT: v_mov_b32_e32 v0, s13 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:228 -; GCN-NEXT: v_mov_b32_e32 v0, s46 +; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:232 -; GCN-NEXT: v_mov_b32_e32 v0, s47 +; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:236 -; GCN-NEXT: v_mov_b32_e32 v0, s48 +; GCN-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:240 -; GCN-NEXT: v_mov_b32_e32 v0, s49 +; GCN-NEXT: v_mov_b32_e32 v0, s17 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:244 -; GCN-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NEXT: s_and_b32 s4, s25, 63 +; GCN-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NEXT: s_and_b32 s4, s35, 63 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:248 -; GCN-NEXT: v_mov_b32_e32 v0, s51 +; GCN-NEXT: v_mov_b32_e32 v0, s19 ; GCN-NEXT: s_lshl_b32 s4, s4, 2 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:252 -; GCN-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 @@ -218,37 +218,37 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:248 ; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:252 ; GCN-NEXT: s_waitcnt vmcnt(60) -; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[20:21] +; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[28:29] ; GCN-NEXT: s_waitcnt vmcnt(57) -; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[20:21] offset:16 +; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[28:29] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(54) -; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[20:21] offset:32 +; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[28:29] offset:32 ; GCN-NEXT: s_waitcnt vmcnt(51) -; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[20:21] offset:48 +; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[28:29] offset:48 ; GCN-NEXT: s_waitcnt vmcnt(48) -; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[20:21] offset:64 +; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[28:29] offset:64 ; GCN-NEXT: s_waitcnt vmcnt(45) -; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[20:21] offset:80 +; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[28:29] offset:80 ; GCN-NEXT: s_waitcnt vmcnt(42) -; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[20:21] offset:96 +; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[28:29] offset:96 ; GCN-NEXT: s_waitcnt vmcnt(39) -; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[20:21] offset:112 +; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[28:29] offset:112 ; GCN-NEXT: s_waitcnt vmcnt(36) -; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[20:21] offset:128 +; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[28:29] offset:128 ; GCN-NEXT: s_waitcnt vmcnt(33) -; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[20:21] offset:144 +; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[28:29] offset:144 ; GCN-NEXT: s_waitcnt vmcnt(30) -; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[20:21] offset:160 +; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[28:29] offset:160 ; GCN-NEXT: s_waitcnt vmcnt(27) -; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[20:21] offset:176 +; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[28:29] offset:176 ; GCN-NEXT: s_waitcnt vmcnt(24) -; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[20:21] offset:192 +; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[28:29] offset:192 ; GCN-NEXT: s_waitcnt vmcnt(21) -; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[20:21] offset:208 +; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[28:29] offset:208 ; GCN-NEXT: s_waitcnt vmcnt(18) -; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[20:21] offset:224 +; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[28:29] offset:224 ; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[20:21] offset:240 +; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[28:29] offset:240 ; GCN-NEXT: s_endpgm %vec = load <64 x i32>, ptr addrspace(1) %ptr %insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 533b25ef1a0c0..5a4c3ad398acf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -701,44 +701,44 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s5 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v6, s7 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s11 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s12 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s15 +; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s16 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s17 +; GPRIDX-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v14, v14, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 3, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 4, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 5, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[14:15], 7, v2 -; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[16:17] -; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[16:17] -; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[12:13] -; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[14:15] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[12:13] -; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[14:15] +; GPRIDX-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v17, v17, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v18, v18, v1, vcc ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off @@ -901,47 +901,47 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s14, s16 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s13 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s12 -; GPRIDX-NEXT: v_mov_b32_e32 v12, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s10 -; GPRIDX-NEXT: v_mov_b32_e32 v10, s9 -; GPRIDX-NEXT: v_mov_b32_e32 v9, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v8, s7 -; GPRIDX-NEXT: v_mov_b32_e32 v7, s6 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s5 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v4, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v0, s19 -; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[12:13] +; GPRIDX-NEXT: v_mov_b32_e32 v6, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[12:13] -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v0, s[10:11] +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v18, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v18, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s8 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v18, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v18, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v18, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 +; GPRIDX-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v14, v14, v18, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 +; GPRIDX-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[1:4], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[5:8], off @@ -1320,45 +1320,45 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s14, s16 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 -; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 -; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 -; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 -; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v2 -; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[12:13] +; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[12:13] ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[10:11] +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v14, v14, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 +; GPRIDX-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v17, v17, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v18, v18, v1, vcc ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off @@ -1509,8 +1509,8 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_s_v: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: v_mov_b32_e32 v17, s2 -; GPRIDX-NEXT: v_mov_b32_e32 v18, s3 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s3 ; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 @@ -5651,54 +5651,54 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 -; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 -; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 -; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 -; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v5, v0, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v11, v0, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v13, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[8:9] ; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v14, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9] +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v14, v14, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v15, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc +; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v3 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v4 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v5 ; GPRIDX-NEXT: v_readfirstlane_b32 s3, v6 -; GPRIDX-NEXT: v_readfirstlane_b32 s4, v5 +; GPRIDX-NEXT: v_readfirstlane_b32 s4, v7 ; GPRIDX-NEXT: v_readfirstlane_b32 s5, v8 -; GPRIDX-NEXT: v_readfirstlane_b32 s6, v7 +; GPRIDX-NEXT: v_readfirstlane_b32 s6, v9 ; GPRIDX-NEXT: v_readfirstlane_b32 s7, v10 -; GPRIDX-NEXT: v_readfirstlane_b32 s8, v9 +; GPRIDX-NEXT: v_readfirstlane_b32 s8, v11 ; GPRIDX-NEXT: v_readfirstlane_b32 s9, v12 -; GPRIDX-NEXT: v_readfirstlane_b32 s10, v11 -; GPRIDX-NEXT: v_readfirstlane_b32 s11, v13 +; GPRIDX-NEXT: v_readfirstlane_b32 s10, v13 +; GPRIDX-NEXT: v_readfirstlane_b32 s11, v14 ; GPRIDX-NEXT: v_readfirstlane_b32 s12, v0 ; GPRIDX-NEXT: v_readfirstlane_b32 s13, v1 ; GPRIDX-NEXT: ; return to shader part epilog @@ -6209,17 +6209,17 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 4, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 ; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v11, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v12, v1, vcc ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v3 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v4 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v5 @@ -6227,7 +6227,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg ; GPRIDX-NEXT: v_readfirstlane_b32 s4, v7 ; GPRIDX-NEXT: v_readfirstlane_b32 s5, v8 ; GPRIDX-NEXT: v_readfirstlane_b32 s6, v9 -; GPRIDX-NEXT: v_readfirstlane_b32 s7, v2 +; GPRIDX-NEXT: v_readfirstlane_b32 s7, v10 ; GPRIDX-NEXT: v_readfirstlane_b32 s8, v0 ; GPRIDX-NEXT: v_readfirstlane_b32 s9, v1 ; GPRIDX-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll index 4361e5c113708..08b50c9a166e6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll @@ -774,54 +774,53 @@ define amdgpu_ps void @load_uniform_P1_v16i32(ptr addrspace(1) inreg %ptra, ptr ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 ; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 -; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: buffer_load_dwordx4 v[18:21], off, s[0:3], 0 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7-NEXT: v_readfirstlane_b32 s5, v3 ; GFX7-NEXT: v_readfirstlane_b32 s6, v4 ; GFX7-NEXT: v_readfirstlane_b32 s7, v5 -; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s8, v6 ; GFX7-NEXT: v_readfirstlane_b32 s9, v7 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], off, s[0:3], 0 offset:16 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s10, v8 ; GFX7-NEXT: v_readfirstlane_b32 s11, v9 -; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:16 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s12, v10 ; GFX7-NEXT: v_readfirstlane_b32 s13, v11 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], off, s[0:3], 0 offset:32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s14, v12 ; GFX7-NEXT: v_readfirstlane_b32 s15, v13 -; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 glc +; GFX7-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:48 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s20, v18 ; GFX7-NEXT: v_readfirstlane_b32 s16, v14 +; GFX7-NEXT: v_readfirstlane_b32 s21, v19 +; GFX7-NEXT: v_readfirstlane_b32 s22, v20 +; GFX7-NEXT: v_readfirstlane_b32 s23, v21 +; GFX7-NEXT: s_add_i32 s4, s4, s20 ; GFX7-NEXT: v_readfirstlane_b32 s17, v15 ; GFX7-NEXT: v_readfirstlane_b32 s18, v16 ; GFX7-NEXT: v_readfirstlane_b32 s19, v17 -; GFX7-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 offset:48 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: v_readfirstlane_b32 s20, v2 -; GFX7-NEXT: v_readfirstlane_b32 s21, v3 -; GFX7-NEXT: v_readfirstlane_b32 s22, v4 -; GFX7-NEXT: v_readfirstlane_b32 s23, v5 -; GFX7-NEXT: s_add_i32 s4, s4, s20 -; GFX7-NEXT: v_readfirstlane_b32 s24, v6 -; GFX7-NEXT: v_readfirstlane_b32 s25, v7 -; GFX7-NEXT: v_readfirstlane_b32 s26, v8 -; GFX7-NEXT: v_readfirstlane_b32 s27, v9 ; GFX7-NEXT: s_add_i32 s5, s5, s21 -; GFX7-NEXT: v_readfirstlane_b32 s28, v10 -; GFX7-NEXT: v_readfirstlane_b32 s29, v11 -; GFX7-NEXT: v_readfirstlane_b32 s30, v12 -; GFX7-NEXT: v_readfirstlane_b32 s31, v13 ; GFX7-NEXT: s_add_i32 s6, s6, s22 -; GFX7-NEXT: v_readfirstlane_b32 s33, v14 -; GFX7-NEXT: v_readfirstlane_b32 s34, v15 -; GFX7-NEXT: v_readfirstlane_b32 s35, v16 -; GFX7-NEXT: v_readfirstlane_b32 s36, v17 ; GFX7-NEXT: s_add_i32 s7, s7, s23 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_readfirstlane_b32 s24, v2 +; GFX7-NEXT: v_readfirstlane_b32 s25, v3 +; GFX7-NEXT: v_readfirstlane_b32 s26, v4 +; GFX7-NEXT: v_readfirstlane_b32 s27, v5 ; GFX7-NEXT: s_add_i32 s8, s8, s24 +; GFX7-NEXT: v_readfirstlane_b32 s28, v6 +; GFX7-NEXT: v_readfirstlane_b32 s29, v7 +; GFX7-NEXT: v_readfirstlane_b32 s30, v8 +; GFX7-NEXT: v_readfirstlane_b32 s33, v10 +; GFX7-NEXT: v_readfirstlane_b32 s31, v9 +; GFX7-NEXT: v_readfirstlane_b32 s34, v11 +; GFX7-NEXT: v_readfirstlane_b32 s35, v12 +; GFX7-NEXT: v_readfirstlane_b32 s36, v13 ; GFX7-NEXT: s_add_i32 s12, s12, s28 ; GFX7-NEXT: s_add_i32 s16, s16, s33 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 3eecaccf0308f..418268513a201 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2403,204 +2403,210 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7-LABEL: v_mul_i256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX7-NEXT: v_mul_lo_u32 v30, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] -; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] -; GFX7-NEXT: v_addc_u32_e32 v28, vcc, 0, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] -; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] -; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] -; GFX7-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc -; GFX7-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] -; GFX7-NEXT: v_mov_b32_e32 v22, v26 -; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc -; GFX7-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] -; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] -; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX7-NEXT: v_mov_b32_e32 v16, v0 +; GFX7-NEXT: v_mov_b32_e32 v17, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v14, 0 +; GFX7-NEXT: v_mov_b32_e32 v18, v3 +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v13, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v12, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v18, v11, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v10, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v16, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[40:41], v17, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v9, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v0, v10, v[21:22] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v10, 0 +; GFX7-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v18, v9, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[42:43], v17, v9, v[21:22] +; GFX7-NEXT: v_mad_u64_u32 v[27:28], s[6:7], v6, v8, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v4, v8, v[25:26] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v0, v8, v[23:24] +; GFX7-NEXT: v_mov_b32_e32 v22, v27 +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[10:11], v16, v13, v[21:22] +; GFX7-NEXT: v_mov_b32_e32 v19, v3 +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[44:45], v16, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v17, v12, v[23:24] +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[14:15], v17, v10, v[21:22] +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[16:17], v0, v11, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v0, v9, v[23:24] +; GFX7-NEXT: v_mul_lo_u32 v13, v0, v13 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[20:21], v18, v10, v[21:22] +; GFX7-NEXT: v_mul_lo_u32 v23, v4, v11 +; GFX7-NEXT: v_mul_lo_u32 v25, v18, v12 +; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[22:23], v4, v9, v[0:1] +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[24:25], v16, v8, 0 +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[24:25], v18, v8, v[19:20] +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[28:29], v16, v9, v[1:2] ; GFX7-NEXT: v_mul_lo_u32 v24, v6, v9 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] -; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] -; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] -; GFX7-NEXT: v_mul_lo_u32 v10, v1, v14 -; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 -; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] -; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] -; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] -; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc -; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v0, v16 -; GFX7-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-NEXT: v_mov_b32_e32 v2, v12 +; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[40:41] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[26:27], v17, v8, v[3:4] +; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[28:29] +; GFX7-NEXT: v_addc_u32_e64 v3, s[26:27], v3, v11, s[26:27] +; GFX7-NEXT: v_mul_lo_u32 v11, v17, v14 +; GFX7-NEXT: v_mul_lo_u32 v14, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[28:29], v5, v8, v[21:22] +; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[44:45] +; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[42:43] +; GFX7-NEXT: v_addc_u32_e64 v5, s[14:15], 0, v5, s[14:15] +; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], 0, v4, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v5, s[14:15], 0, v5, s[18:19] +; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], v4, v12, s[26:27] +; GFX7-NEXT: v_addc_u32_e64 v5, s[14:15], 0, v5, s[24:25] +; GFX7-NEXT: v_addc_u32_e64 v5, s[8:9], v5, v9, s[8:9] +; GFX7-NEXT: v_mul_lo_u32 v9, v16, v15 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX7-NEXT: v_addc_u32_e64 v6, vcc, 0, v6, s[4:5] +; GFX7-NEXT: v_addc_u32_e64 v6, vcc, 0, v6, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v6, vcc, v6, v10, s[8:9] +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v28, v9, vcc +; GFX7-NEXT: v_addc_u32_e64 v9, vcc, v9, v11, s[28:29] +; GFX7-NEXT: v_addc_u32_e64 v9, vcc, v9, v13, s[22:23] +; GFX7-NEXT: v_addc_u32_e64 v9, vcc, v9, v25, s[20:21] +; GFX7-NEXT: v_addc_u32_e64 v9, vcc, v9, v23, s[16:17] +; GFX7-NEXT: v_addc_u32_e64 v9, vcc, v9, v14, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v11, vcc, v9, v24, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[11:12] ; GFX7-NEXT: v_mov_b32_e32 v7, v9 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX8-NEXT: v_mul_lo_u32 v30, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] -; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] -; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] -; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] -; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v20, vcc -; GFX8-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] -; GFX8-NEXT: v_mov_b32_e32 v22, v26 -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v16, vcc -; GFX8-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] -; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] -; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX8-NEXT: v_mov_b32_e32 v16, v0 +; GFX8-NEXT: v_mov_b32_e32 v17, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v14, 0 +; GFX8-NEXT: v_mov_b32_e32 v18, v3 +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v13, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v12, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v18, v11, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v10, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v16, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[40:41], v17, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v9, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v0, v10, v[21:22] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v10, 0 +; GFX8-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v18, v9, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[42:43], v17, v9, v[21:22] +; GFX8-NEXT: v_mad_u64_u32 v[27:28], s[6:7], v6, v8, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v4, v8, v[25:26] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v0, v8, v[23:24] +; GFX8-NEXT: v_mov_b32_e32 v22, v27 +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[10:11], v16, v13, v[21:22] +; GFX8-NEXT: v_mov_b32_e32 v19, v3 +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[44:45], v16, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v17, v12, v[23:24] +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[14:15], v17, v10, v[21:22] +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[16:17], v0, v11, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v0, v9, v[23:24] +; GFX8-NEXT: v_mul_lo_u32 v13, v0, v13 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[20:21], v18, v10, v[21:22] +; GFX8-NEXT: v_mul_lo_u32 v23, v4, v11 +; GFX8-NEXT: v_mul_lo_u32 v25, v18, v12 +; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[22:23], v4, v9, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[24:25], v16, v8, 0 +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[24:25], v18, v8, v[19:20] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[28:29], v16, v9, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v24, v6, v9 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] -; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] -; GFX8-NEXT: v_mul_lo_u32 v10, v1, v14 -; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 -; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] -; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], 0, v2, s[12:13] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] -; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v4, v21, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v28, v22, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v5, v13, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v23, v14, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v27, v0, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v10, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v30, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v29, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v24, vcc -; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, v16 -; GFX8-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-NEXT: v_mov_b32_e32 v2, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[40:41] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[26:27], v17, v8, v[3:4] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[28:29] +; GFX8-NEXT: v_addc_u32_e64 v3, s[26:27], v3, v11, s[26:27] +; GFX8-NEXT: v_mul_lo_u32 v11, v17, v14 +; GFX8-NEXT: v_mul_lo_u32 v14, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[28:29], v5, v8, v[21:22] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[44:45] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[42:43] +; GFX8-NEXT: v_addc_u32_e64 v5, s[14:15], 0, v5, s[14:15] +; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], 0, v4, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v5, s[14:15], 0, v5, s[18:19] +; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], v4, v12, s[26:27] +; GFX8-NEXT: v_addc_u32_e64 v5, s[14:15], 0, v5, s[24:25] +; GFX8-NEXT: v_addc_u32_e64 v5, s[8:9], v5, v9, s[8:9] +; GFX8-NEXT: v_mul_lo_u32 v9, v16, v15 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX8-NEXT: v_addc_u32_e64 v6, vcc, 0, v6, s[4:5] +; GFX8-NEXT: v_addc_u32_e64 v6, vcc, 0, v6, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v6, vcc, v6, v10, s[8:9] +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v28, v9, vcc +; GFX8-NEXT: v_addc_u32_e64 v9, vcc, v9, v11, s[28:29] +; GFX8-NEXT: v_addc_u32_e64 v9, vcc, v9, v13, s[22:23] +; GFX8-NEXT: v_addc_u32_e64 v9, vcc, v9, v25, s[20:21] +; GFX8-NEXT: v_addc_u32_e64 v9, vcc, v9, v23, s[16:17] +; GFX8-NEXT: v_addc_u32_e64 v9, vcc, v9, v14, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v11, vcc, v9, v24, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[11:12] ; GFX8-NEXT: v_mov_b32_e32 v7, v9 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX9-NEXT: v_mul_lo_u32 v30, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v3, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v9, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v4, v10, v[20:21] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v5, v9, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[24:25], s[6:7], v1, v11, v[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[16:17] -; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v2, v10, v[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[26:27], s[8:9], v6, v8, v[22:23] -; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v3, v9, v[16:17] -; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v20, vcc -; GFX9-NEXT: v_mad_u64_u32 v[20:21], vcc, v4, v8, v[22:23] -; GFX9-NEXT: v_mov_b32_e32 v22, v26 -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v16, vcc -; GFX9-NEXT: v_mad_u64_u32 v[16:17], vcc, v0, v13, v[21:22] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v0, v11, v[19:20] -; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v1, v12, v[16:17] -; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v2, v11, v[19:20] +; GFX9-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v14, 0 +; GFX9-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v13, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v12, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v18, v11, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v10, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v16, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[40:41], v17, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v9, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v0, v10, v[21:22] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v10, 0 +; GFX9-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v18, v9, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[42:43], v17, v9, v[21:22] +; GFX9-NEXT: v_mad_u64_u32 v[27:28], s[6:7], v6, v8, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v4, v8, v[25:26] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v0, v8, v[23:24] +; GFX9-NEXT: v_mov_b32_e32 v22, v27 +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[10:11], v16, v13, v[21:22] +; GFX9-NEXT: v_mov_b32_e32 v19, v3 +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[44:45], v16, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v17, v12, v[23:24] +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[14:15], v17, v10, v[21:22] +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[16:17], v0, v11, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[18:19], v0, v9, v[23:24] +; GFX9-NEXT: v_mul_lo_u32 v13, v0, v13 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[20:21], v18, v10, v[21:22] +; GFX9-NEXT: v_mul_lo_u32 v23, v4, v11 +; GFX9-NEXT: v_mul_lo_u32 v25, v18, v12 +; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[22:23], v4, v9, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[24:25], v16, v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[24:25], v18, v8, v[19:20] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[28:29], v16, v9, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v24, v6, v9 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v4, v9, v[11:12] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v10, v[21:22] -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[11:12] -; GFX9-NEXT: v_mul_lo_u32 v10, v1, v14 -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[14:15], v5, v8, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[12:13], v3, v8, v[19:20] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], 0, v2, s[12:13] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v0, v9, v[17:18] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] -; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v1, v8, v[2:3] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v4, v21, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v28, v22, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v5, v13, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v23, v14, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v27, v0, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v10, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v30, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v29, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v24, vcc -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v16 -; GFX9-NEXT: v_mov_b32_e32 v1, v11 -; GFX9-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[40:41] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[26:27], v17, v8, v[3:4] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[28:29] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[26:27], v3, v11, s[26:27] +; GFX9-NEXT: v_mul_lo_u32 v11, v17, v14 +; GFX9-NEXT: v_mul_lo_u32 v14, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[28:29], v5, v8, v[21:22] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[44:45] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[42:43] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[14:15], 0, v5, s[14:15] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[8:9], 0, v4, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[14:15], 0, v5, s[18:19] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[8:9], v4, v12, s[26:27] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[14:15], 0, v5, s[24:25] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[8:9], v5, v9, s[8:9] +; GFX9-NEXT: v_mul_lo_u32 v9, v16, v15 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, 0, v6, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, 0, v6, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v10, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v28, v9, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v9, v11, s[28:29] +; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v9, v13, s[22:23] +; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v9, v25, s[20:21] +; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v9, v23, s[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v9, v14, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v11, vcc, v9, v24, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[11:12] ; GFX9-NEXT: v_mov_b32_e32 v7, v9 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index b7c84f1389197..bd540a7873e6c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -377,183 +377,183 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc ; GISEL-NEXT: v_xor_b32_e32 v10, v4, v8 -; GISEL-NEXT: v_xor_b32_e32 v9, v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v9 +; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 ; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v9, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v4 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v4, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v5, 0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v9, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 v12, v9, v11 +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v18, v5, v[13:14] +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v15 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v15 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v5, v15 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v9, v15 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v12, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v5, 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v19, v[12:13] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v18, v5, v[13:14] +; GISEL-NEXT: v_xor_b32_e32 v16, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v15 +; GISEL-NEXT: v_xor_b32_e32 v17, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v17, v15, 0 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v0, v5 -; GISEL-NEXT: v_mul_lo_u32 v0, v16, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 -; GISEL-NEXT: v_xor_b32_e32 v18, v1, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v15 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v15 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v15 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v15, 0 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v11, v4 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v16, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v15, v[11:12] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v18, v13 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v9 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v11, v5 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v18, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[11:12] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v17, v13 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v4, v9, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v11, v12, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v7, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v7, v6, v5 +; GISEL-NEXT: v_xor_b32_e32 v6, v11, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v15 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v4 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v7, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v7, v6, v4 -; GISEL-NEXT: v_xor_b32_e32 v6, v17, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v17, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v18, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v11 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9 -; GISEL-NEXT: v_mac_f32_e32 v17, 0x4f800000, v18 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v17 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v14, v0, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v1, v1 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[9:10] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v16, 0 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v11, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v17, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v18, vcc +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v16, v[10:11] +; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v21, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v21, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v12 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v15, v1, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v0 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v17, v12 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v11, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v12 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v10, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v17, v1 -; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v11, v1 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v16, v0 +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v17, v1, s[4:5] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v12, 0 +; GISEL-NEXT: v_xor_b32_e32 v15, v9, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v18, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v15 +; GISEL-NEXT: v_xor_b32_e32 v1, v13, v15 +; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v12, v[8:9] -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v15, vcc -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v13 ; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v17, v3, v15 +; GISEL-NEXT: v_xor_b32_e32 v16, v3, v13 ; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 @@ -575,62 +575,62 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v17, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v16, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v13, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_xor_b32_e32 v4, v4, v15 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v17, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v8, v16, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v17, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v13, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v12, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v10, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v17, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 -; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v12, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v15 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v4, v15, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v10, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v15, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1170,7 +1170,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; GISEL-NEXT: v_mov_b32_e32 v7, 0xffed2705 ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 @@ -1180,78 +1180,75 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v5, v5 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] -; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v9, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] -; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] -; GISEL-NEXT: s_mov_b32 s6, 1 -; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v19, 0 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v9, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v7, v6, v[14:15] +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], s6, v19, v[15:16] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v6, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, v19, v17 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v6, v17 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v14, v19, v17 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v17 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 ; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 -; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 +; GISEL-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 @@ -1259,147 +1256,150 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v20, vcc, v1, v0 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v20, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v14, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v17, 0 +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v14, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v19, v[1:2] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 ; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v20, v15, vcc +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v20, v15 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v17 -; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v20, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v9, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[0:1] ; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v16 -; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v18, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v16, v18, v1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v1, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v0 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v9, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v5, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v20, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 -; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v17, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v12, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v14, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], s6, v5, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v13, v19, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v14 +; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v5, v10 +; GISEL-NEXT: v_xor_b32_e32 v11, v3, v14 +; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v5, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v12, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v11, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v7, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v14 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v14 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -1415,195 +1415,195 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_trunc_f32_e32 v5, v5 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v8, v9, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] -; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v9, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v19, v[14:15] +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_mad_u64_u32 v[17:18], s[4:5], -1, v4, v[15:16] +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v0, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v4, v17 +; CGP-NEXT: v_xor_b32_e32 v16, v1, v8 +; CGP-NEXT: v_mul_hi_u32 v1, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v19, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v19, v17 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v17 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v19, v17 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v14, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v16, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v17, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v16, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v17, v16, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v17, v0 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v1, v19, v1 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; CGP-NEXT: v_add_i32_e32 v17, vcc, v0, v13 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v1, v0 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v16, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v14, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v16, v13, vcc +; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v16, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v18, vcc, 1, v15 -; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v16, vcc +; CGP-NEXT: v_add_i32_e32 v16, vcc, 1, v17 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] ; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v18 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[5:6] +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v16 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: v_addc_u32_e32 v21, vcc, 0, v19, vcc ; CGP-NEXT: v_mul_lo_u32 v1, v7, v13 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v16, v5, vcc ; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc -; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; CGP-NEXT: v_cndmask_b32_e32 v14, v17, v0, vcc +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v11 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_lo_u32 v10, v9, v13 ; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v13 ; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 ; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v11, v1 +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v9, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] -; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[9:10] ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 -; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 +; CGP-NEXT: v_xor_b32_e32 v9, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 +; CGP-NEXT: v_xor_b32_e32 v10, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 -; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v2 +; CGP-NEXT: v_xor_b32_e32 v12, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v7, v9, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v12, v8 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v7, v10, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v9, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v3 -; CGP-NEXT: v_mul_hi_u32 v6, v8, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v6, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v10, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v10, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1611,20 +1611,20 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 @@ -1824,268 +1824,268 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v12, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v13, 0 -; GISEL-NEXT: v_lshl_b64 v[7:8], v[12:13], v4 +; GISEL-NEXT: v_mov_b32_e32 v11, 0x1000 +; GISEL-NEXT: v_mov_b32_e32 v12, 0 +; GISEL-NEXT: v_lshl_b64 v[7:8], v[11:12], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v4 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v8, v5, v4 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v7, vcc -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 +; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v19, vcc, 0, v5, vcc +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v5, 0 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v11, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v19, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v5, v[14:15] -; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v17, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v17, v9 +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v19, v7, v[13:14] +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v15 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v19 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_mul_hi_u32 v14, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v20 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v11, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v5, 0 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v19, v[11:12] -; GISEL-NEXT: v_mul_lo_u32 v9, v19, v10 -; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], v6 -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v5, v[14:15] -; GISEL-NEXT: v_mul_hi_u32 v14, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v16 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v9 -; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v1, v9, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v14, v0, v9 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v7, v9 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v20, 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v17, v[10:11] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v19, v20, v[13:14] +; GISEL-NEXT: v_xor_b32_e32 v16, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v20, v15 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v1, v20, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v19, v16 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v16 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v15 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v20, v15 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v17, v15 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_xor_b32_e32 v4, v9, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v20, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_lshl_b64 v[11:12], v[11:12], v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v1 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v16, 0 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v10, v5 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v17, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[10:11] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v10, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v17, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v5, v15, v[9:10] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v18, v13 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v7 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v7, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v6, v5, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v16, v9, v10, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v6 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v12, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v9, v6 +; GISEL-NEXT: v_xor_b32_e32 v9, v11, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v16 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v10, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v5 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v13, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v6, v5 -; GISEL-NEXT: v_xor_b32_e32 v6, v12, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v11 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v14 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v15, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v1, v1 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v7, vcc -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 -; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v10 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v18, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v22, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v14, v[7:8] -; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v13, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v18, v0 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v21, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v22 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v10 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v18, 0 +; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v19, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v17, vcc +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v21, v18, v[11:12] +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v1 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v13 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v15, v1, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v0 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v8, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1 -; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v19, v13 +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v11, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v18, v13 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v8, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v7, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v7 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v1 -; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v18, v0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v19, v14, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v13, v[7:8] -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v11 -; GISEL-NEXT: v_xor_b32_e32 v16, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v13 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v8, v1 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v11, v1 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v18, v0 +; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v19, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v11, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v17, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v15, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v13, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v15 +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v21, v11, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7 +; GISEL-NEXT: v_xor_b32_e32 v17, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v11, v15, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v16, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v16, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v14, v15 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v16, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 +; GISEL-NEXT: v_mul_lo_u32 v5, v17, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v14, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v13, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v4, v0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v13, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v15 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v15, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v9, v11, v[4:5] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v13 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v14, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v5 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 9d6ffc9bbc0dc..6a7d9e1950e43 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1551,36 +1551,36 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s2, v2 ; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v7, vcc +; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v10 +; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v10 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v11, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v14 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v9 -; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s2, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13 +; GFX8-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1] +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v10, v13, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v11, v14, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v2, v7, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] -; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7 -; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX8-NEXT: v_mov_b32_e32 v7, s1 +; GFX8-NEXT: v_xor_b32_e32 v2, s0, v8 +; GFX8-NEXT: v_xor_b32_e32 v3, s1, v10 +; GFX8-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v9 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc +; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 ; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6 ; GFX8-NEXT: v_mov_b32_e32 v9, s6 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7 @@ -1649,7 +1649,6 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] @@ -1689,209 +1688,210 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_lo_u32 v4, s11, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s10, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v8, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 +; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s8, v9, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, s10, v1 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s9, v8, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_subb_co_u32_e64 v11, s[0:1], v2, v5, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11 +; GFX9-NEXT: v_sub_u32_e32 v1, s11, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v9 -; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v7, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v10 +; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], 1, v8 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v9, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 -; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v3 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v5 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v3 ; GFX9-NEXT: s_ashr_i32 s8, s3, 31 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v4 ; GFX9-NEXT: s_add_u32 s10, s18, s6 -; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v6, s[0:1] ; GFX9-NEXT: s_addc_u32 s11, s19, s6 ; GFX9-NEXT: s_add_u32 s0, s2, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_addc_u32 s1, s3, s8 ; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9] -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v16, s3 +; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v1, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mul_f32_e32 v12, 0x4f800000, v16 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_add_f32_e32 v1, v12, v1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[0:1] ; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] -; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 -; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v2 +; GFX9-NEXT: v_add_f32_e32 v1, v4, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[18:19], s5, v11, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v14, vcc +; GFX9-NEXT: s_sub_u32 s5, 0, s2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v7, vcc +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v12, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v12, v[1:2] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v11, v[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v1, v12, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, v11, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v11, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v12, v4 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mul_hi_u32 v2, v11, v4 -; GFX9-NEXT: v_xor_b32_e32 v6, s16, v6 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[18:19], s5, v13, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v15, vcc +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[18:19], s20, v12, v[3:4] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v2, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v2, v13, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, v12, v5 +; GFX9-NEXT: v_mul_hi_u32 v4, v12, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v7, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, v13, v5 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v12, v4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v0 -; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v11, v[3:4] -; GFX9-NEXT: v_xor_b32_e32 v1, s17, v7 -; GFX9-NEXT: v_mov_b32_e32 v7, s17 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s20, v10, v[4:5] -; GFX9-NEXT: v_mul_lo_u32 v4, v11, v2 -; GFX9-NEXT: v_xor_b32_e32 v3, s4, v9 -; GFX9-NEXT: v_mul_lo_u32 v7, v10, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v10, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_xor_b32_e32 v5, s4, v8 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_mul_hi_u32 v3, v12, v5 +; GFX9-NEXT: v_xor_b32_e32 v7, s16, v8 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, v11, v6 -; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 -; GFX9-NEXT: v_mul_hi_u32 v7, v10, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, v11, v6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v13, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v1 +; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v12, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s5, v13, v[4:5] +; GFX9-NEXT: v_xor_b32_e32 v2, s17, v9 +; GFX9-NEXT: v_mov_b32_e32 v8, s17 +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s16, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v8, vcc +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s20, v12, v[5:6] +; GFX9-NEXT: v_mul_lo_u32 v5, v13, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, s4, v10 +; GFX9-NEXT: v_mul_lo_u32 v8, v12, v7 +; GFX9-NEXT: v_mul_hi_u32 v10, v12, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, v13, v3 +; GFX9-NEXT: v_xor_b32_e32 v6, s4, v11 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v13, v7 +; GFX9-NEXT: v_add_u32_e32 v5, v8, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v12, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v13, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v5, v8, v5, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v12, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, s11, v3 +; GFX9-NEXT: v_mul_lo_u32 v10, s10, v7 +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s4, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, s10, v3 +; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s11, v2 -; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s10, v2 -; GFX9-NEXT: v_mov_b32_e32 v8, s4 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v8, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s11, v6 -; GFX9-NEXT: v_mul_hi_u32 v2, s11, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v8, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, s10, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, s11, v6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8 +; GFX9-NEXT: v_mul_lo_u32 v8, s11, v7 +; GFX9-NEXT: v_mul_hi_u32 v3, s11, v3 +; GFX9-NEXT: v_add_u32_e32 v4, v9, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, s10, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, s11, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 -; GFX9-NEXT: v_add3_u32 v11, v7, v9, v6 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v12, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 -; GFX9-NEXT: v_sub_u32_e32 v7, s11, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v3, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v11, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v8, v8, v9 +; GFX9-NEXT: v_add3_u32 v12, v8, v10, v7 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s2, v12, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s10, v3 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], s3, v11, v[7:8] +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v13, v9, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 +; GFX9-NEXT: v_sub_u32_e32 v8, s11, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s2, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v11, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v9 -; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s2, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v12, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v10 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v13 +; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s2, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 +; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v18, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v12, v15, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] -; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7 -; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX9-NEXT: v_mov_b32_e32 v7, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc -; GFX9-NEXT: v_xor_b32_e32 v7, s6, v9 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6 -; GFX9-NEXT: v_mov_b32_e32 v9, s6 -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc -; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] -; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] +; GFX9-NEXT: v_xor_b32_e32 v3, s0, v9 +; GFX9-NEXT: v_xor_b32_e32 v4, s1, v11 +; GFX9-NEXT: v_mov_b32_e32 v9, s1 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v9, vcc +; GFX9-NEXT: v_xor_b32_e32 v8, s6, v8 +; GFX9-NEXT: v_xor_b32_e32 v9, s6, v7 +; GFX9-NEXT: v_mov_b32_e32 v10, s6 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v9, v10, vcc +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[12:13] +; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 39cf7b01fd6c0..b6f52d9b10613 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -463,108 +463,108 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v10, v9 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2] ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v13, v[9:10] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v14, v0 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v16, v11, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v14, v0 +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], v16, v11, vcc ; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v8, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v8 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v0, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v7 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1 -; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v1, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v1 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v0, v8, vcc -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v1, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v8, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v0 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v12, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v9, vcc +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v8 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v17, v10, v1, s[4:5] ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v1, v1 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v18, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v5 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v16, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v0 +; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 0, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v1 +; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, v7, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v18, 0 +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v9, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v20, v19, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v21, v18, v[8:9] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v18, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v18, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_mul_hi_u32 v8, v17, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, v19, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v18, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v1 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v15, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v19, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v19, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v15, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v12, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v20, v16, v[1:2] ; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v14, v[8:9] ; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v21, v15, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: v_xor_b32_e32 v11, v2, v12 -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v14, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v15, v10 ; GISEL-NEXT: v_xor_b32_e32 v13, v3, v12 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v10 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v16, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 ; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 @@ -1100,7 +1100,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; GISEL-NEXT: v_mov_b32_e32 v7, 0xfffff000 ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 @@ -1110,78 +1110,75 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v5, v5 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] -; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v9, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] -; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] -; GISEL-NEXT: s_mov_b32 s6, 1 -; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v19, 0 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v9, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v7, v6, v[14:15] +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], s6, v19, v[15:16] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v6, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, v19, v17 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v6, v17 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v14, v19, v17 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v17 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 ; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 -; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 +; GISEL-NEXT: v_mov_b32_e32 v6, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 @@ -1189,32 +1186,35 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v14, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v17, 0 +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v14, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v15, v[1:2] +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v6 ; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v9, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v6 +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc @@ -1223,109 +1223,109 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v1, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v0 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v9, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v5, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 -; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v16, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v12, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v14, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], s6, v5, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v14 +; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v5, v10 +; GISEL-NEXT: v_xor_b32_e32 v11, v3, v14 +; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v0, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v12, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v11, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v8, -1, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v14 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v14 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1341,103 +1341,103 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_trunc_f32_e32 v5, v5 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v8, v9, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] -; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v9, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v19, v[14:15] +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_mad_u64_u32 v[17:18], s[4:5], -1, v4, v[15:16] +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v0, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v4, v17 +; CGP-NEXT: v_xor_b32_e32 v16, v1, v8 +; CGP-NEXT: v_mul_hi_u32 v1, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v19, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v19, v17 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v17 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v19, v17 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v14, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v16, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v17, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v16, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v17, v16, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v17, v0 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v17, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v1 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v14 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v13 +; CGP-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 ; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v16, -1, v1, s[4:5] ; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[5:6] ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc @@ -1445,89 +1445,89 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc ; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc -; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v11 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_lo_u32 v10, v9, v13 ; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v13 ; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 ; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v11, v1 +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v9, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] -; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[9:10] ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 -; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 +; CGP-NEXT: v_xor_b32_e32 v9, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 +; CGP-NEXT: v_xor_b32_e32 v10, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 -; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v2 +; CGP-NEXT: v_xor_b32_e32 v12, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v7, v9, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v12, v8 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v7, v10, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v9, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v7, v10, v2 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3 ; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v7, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v10, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v10, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1689,7 +1689,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; GISEL-NEXT: v_mov_b32_e32 v7, 0xffed2705 ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 @@ -1699,78 +1699,75 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v5, v5 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] -; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[9:10] -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v9, v[5:6] +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v17, v[14:15] -; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v18, v4, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v4, v[15:16] -; GISEL-NEXT: s_mov_b32 s6, 1 -; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v19, 0 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v9, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v7, v6, v[14:15] +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], s6, v19, v[15:16] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v6, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, v19, v17 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v6, v17 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v14, v19, v17 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v17 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 ; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 -; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 +; GISEL-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 @@ -1778,32 +1775,35 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_mul_hi_u32 v1, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v1, v0 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v17, 0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v14, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v17, 0 +; GISEL-NEXT: s_mov_b32 s6, 1 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v14, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v15, v[1:2] +; GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v17, v[13:14] ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v15 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v19, v15, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v6 ; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v7, v13 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v9, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v6 +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v19, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v0, vcc @@ -1812,109 +1812,109 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_mul_hi_u32 v1, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v1, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v0 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v9, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v5, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8] -; GISEL-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v2, v13 -; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v16, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v12, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v14, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], s6, v5, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v14 +; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v5, v10 +; GISEL-NEXT: v_xor_b32_e32 v11, v3, v14 +; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v15, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v9 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v15, v7, vcc -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v0, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v12, v[7:8] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v11, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v8, -1, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v14 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v14 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -1930,103 +1930,103 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_trunc_f32_e32 v5, v5 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v5 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v7, v[9:10] -; CGP-NEXT: v_mul_lo_u32 v9, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v12 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v8, v9, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v8, v12, vcc -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v4, 0 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v16, v[13:14] -; CGP-NEXT: v_mul_lo_u32 v17, v16, v12 -; CGP-NEXT: v_mul_hi_u32 v18, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v19, v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], -1, v4, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v13, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v9, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v6, v19, v[14:15] +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_mad_u64_u32 v[17:18], s[4:5], -1, v4, v[15:16] +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v0, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v4, v17 +; CGP-NEXT: v_xor_b32_e32 v16, v1, v8 +; CGP-NEXT: v_mul_hi_u32 v1, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v19, v13 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v1, v19, v17 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v17 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v16, v12 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_mul_hi_u32 v17, v16, v12 -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v12 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v13 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v12 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v19, v17 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v14, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v16, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v17, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v16, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v17, v16, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v17, v0 +; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v17, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 +; CGP-NEXT: v_mul_hi_u32 v17, v16, v1 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v14 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v15, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v19, v13, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v14 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v13 +; CGP-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 ; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 -; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v16, -1, v1, s[4:5] ; CGP-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[5:6] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[5:6] ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v18, v4 ; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v19, vcc @@ -2034,89 +2034,89 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 ; CGP-NEXT: v_cndmask_b32_e32 v0, v18, v5, vcc ; CGP-NEXT: v_cndmask_b32_e32 v5, v19, v21, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; CGP-NEXT: v_cndmask_b32_e32 v14, v15, v0, vcc -; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v9, v1 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v10 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v0, v11 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_lo_u32 v10, v9, v13 ; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v7, v13 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v13 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v13 ; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v1, v0 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v9, v1 ; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v7, v0 -; CGP-NEXT: v_addc_u32_e64 v10, s[4:5], v8, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc +; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v11, v1 +; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v0 +; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v9, v1, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v10, v[1:2] -; CGP-NEXT: v_xor_b32_e32 v1, v5, v12 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_xor_b32_e32 v1, v5, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v9, v[7:8] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[9:10] ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v2, v13 -; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v9, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v3, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 +; CGP-NEXT: v_xor_b32_e32 v9, v2, v13 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 +; CGP-NEXT: v_xor_b32_e32 v10, v3, v13 +; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v10, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 -; CGP-NEXT: v_xor_b32_e32 v11, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v12 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v9, v2 +; CGP-NEXT: v_xor_b32_e32 v12, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v7, v9, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v12, v8 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v7, v10, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v9, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v7, v10, v2 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v3 ; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v7, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v10, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v10, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -2355,21 +2355,21 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v16, v[9:10] ; GISEL-NEXT: v_mul_lo_u32 v9, v16, v8 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v4, v[12:13] -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v4, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v14 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v14 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc @@ -2439,102 +2439,102 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v6, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v0, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v11 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v10, v1 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v1, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v1 -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v1 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v7 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v7, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v15, v1, v6, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v0, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v8 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v13, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v17, s[4:5], 0, v9, vcc +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v17, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7 -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v0, v7, vcc -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v7 +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v18, v10, v1, s[4:5] ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v1, v1 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5] -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v17, 0 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v18, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v21, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v22, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v23, v18, v0 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 +; GISEL-NEXT: v_sub_i32_e64 v21, s[4:5], 0, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v20, v1 +; GISEL-NEXT: v_subb_u32_e64 v22, s[4:5], 0, v8, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v21, v19, 0 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v9, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v21, v20, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v5 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v22, v19, v[9:10] ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v17, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v21, v1 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v16, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v19, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v19, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v22 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v20, v11 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v23 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v1 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v18, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v20, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v20, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v21, v16, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v16, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v21, v17, v[1:2] ; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v15, v[9:10] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v22, v16, v[9:10] ; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v2, v7 -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v2, v17, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v16, v11 ; GISEL-NEXT: v_xor_b32_e32 v13, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v16, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v16, v11 +; GISEL-NEXT: v_mul_lo_u32 v3, v17, v11 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v9, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v16, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v17, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index c50b491bcb074..c4a9f989e49c8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -1042,10 +1042,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: s_sub_u32 s2, 0, s14 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] ; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX8-NEXT: s_subb_u32 s3, 0, s15 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 @@ -1086,109 +1088,107 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 ; GFX8-NEXT: v_mul_hi_u32 v3, s9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0 -; GFX8-NEXT: v_add_u32_e64 v17, s[2:3], 1, v8 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v3, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v10, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s14 ; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4 -; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s15 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 -; GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15 ; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v9, vcc -; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1] +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s14 +; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3 +; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, s12, v1 +; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v12, vcc +; GFX8-NEXT: v_add_u32_e64 v15, s[0:1], 1, v8 +; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX8-NEXT: v_trunc_f32_e32 v3, v3 ; GFX8-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v4, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v2 -; GFX8-NEXT: s_sub_u32 s8, 0, s14 -; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v3 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s12, v1 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0 -; GFX8-NEXT: v_subbrev_u32_e64 v16, s[0:1], 0, v12, vcc -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v16 -; GFX8-NEXT: s_subb_u32 s9, 0, s15 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v15 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v5, v14, v2 -; GFX8-NEXT: v_mul_lo_u32 v7, v13, v6 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v4, v13, v2 -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v5, v4 -; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v12, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v14, v6 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_mul_hi_u32 v7, v13, v6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_addc_u32_e64 v18, s[2:3], 0, v10, s[2:3] -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 1, v17 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v18, vcc -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v15 -; GFX8-NEXT: v_mul_hi_u32 v6, v14, v6 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc +; GFX8-NEXT: v_cvt_u32_f32_e32 v17, v2 +; GFX8-NEXT: v_addc_u32_e64 v16, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX8-NEXT: v_cvt_u32_f32_e32 v18, v3 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v17, 0 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v17, v[4:5] +; GFX8-NEXT: v_mul_lo_u32 v4, v18, v2 +; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v12, v9, vcc +; GFX8-NEXT: v_mul_lo_u32 v5, v17, v6 +; GFX8-NEXT: v_mul_hi_u32 v12, v17, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v18, v2 +; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], 1, v15 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v12, v18, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_mul_hi_u32 v5, v17, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v12, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v12, v5 +; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, s12, v13 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v9, vcc +; GFX8-NEXT: v_mul_hi_u32 v6, v18, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v2 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v2 +; GFX8-NEXT: v_addc_u32_e64 v19, s[0:1], 0, v16, s[0:1] +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, v18, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v17, 0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v15, v7, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v18, v[5:6] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7] -; GFX8-NEXT: v_mul_lo_u32 v6, v14, v4 -; GFX8-NEXT: v_mul_hi_u32 v9, v13, v4 -; GFX8-NEXT: v_mul_lo_u32 v7, v13, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s3, v17, v[6:7] +; GFX8-NEXT: v_mul_lo_u32 v6, v18, v4 +; GFX8-NEXT: v_mul_hi_u32 v9, v17, v4 +; GFX8-NEXT: v_mul_lo_u32 v7, v17, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v19, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v14, v8 -; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4 +; GFX8-NEXT: v_mul_lo_u32 v9, v18, v8 +; GFX8-NEXT: v_mul_hi_u32 v4, v18, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_mul_hi_u32 v7, v13, v8 +; GFX8-NEXT: v_mul_hi_u32 v7, v17, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v8, v14, v8 +; GFX8-NEXT: v_mul_hi_u32 v8, v18, v8 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v18, v6, vcc ; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4 ; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] @@ -1229,30 +1229,30 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s14, v8 ; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v11 +; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v12, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v11 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v12, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v14 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v9 -; GFX8-NEXT: v_subbrev_u32_e64 v0, s[0:1], 0, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10 +; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13 +; GFX8-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc +; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1] +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v18, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v1, v0, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v12, v14, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v12, v13, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -1272,9 +1272,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v9, s5 -; GFX9-NEXT: s_sub_u32 s8, 0, s6 -; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: v_mov_b32_e32 v10, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -1310,10 +1308,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: s_sub_u32 s2, 0, s6 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] ; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: s_subb_u32 s3, 0, s7 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 @@ -1348,174 +1348,174 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v8, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 -; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 +; GFX9-NEXT: v_add3_u32 v11, v3, v0, v5 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s4, v11, v[2:3] +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s16, v1 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s5, v9, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v3, v5, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s17, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v4, v5, s[0:1] ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v10, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s6 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v9, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3 -; GFX9-NEXT: v_add_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v3 -; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s4, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v13, 0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v16, s[0:1], 0, v12, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v14, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v16 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v13, v[4:5] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v15 +; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s4, v2 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v15, s[0:1], 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e64 v16, s[0:1], 1, v9 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, 0xcf800000, v4 +; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_cvt_u32_f32_e32 v19, v4 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v18, 0 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v19, v[4:5] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v5, v14, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v13, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v16 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v4, v13, v2 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v5, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v12, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v2, v14, v2 -; GFX9-NEXT: v_mul_lo_u32 v9, v14, v6 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v13, v6 -; GFX9-NEXT: v_add_co_u32_e64 v17, s[2:3], 1, v8 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v18, s[2:3], 0, v10, s[2:3] -; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 1, v17 -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v18, vcc -; GFX9-NEXT: v_mul_hi_u32 v6, v14, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v15 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v4, v7, v4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v13, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v9, vcc -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s8, v14, v[5:6] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s9, v13, v[6:7] -; GFX9-NEXT: v_mul_lo_u32 v6, v14, v4 -; GFX9-NEXT: v_mul_hi_u32 v9, v13, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, v13, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v15, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v20, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v18, v[5:6] +; GFX9-NEXT: v_mul_lo_u32 v5, v19, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v13, v10, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v18, v7 +; GFX9-NEXT: v_mul_hi_u32 v10, v18, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, v19, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v15 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, v14, v8 -; GFX9-NEXT: v_mul_hi_u32 v4, v14, v4 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, v13, v8 -; GFX9-NEXT: v_mul_hi_u32 v8, v14, v8 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v19, v7 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, v18, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v19, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v6, v7, v6, v8 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s19, v4 -; GFX9-NEXT: v_mul_lo_u32 v9, s18, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v9 +; GFX9-NEXT: v_add_u32_e32 v6, v10, v6 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 1, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s4, v14 +; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v8, vcc +; GFX9-NEXT: v_add3_u32 v5, v6, v5, v7 +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, v18, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, v19, v5, vcc +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, 0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v16, v10, vcc +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s2, v19, v[6:7] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[2:3], s3, v18, v[7:8] +; GFX9-NEXT: v_mul_lo_u32 v7, v19, v5 +; GFX9-NEXT: v_mul_hi_u32 v10, v18, v5 +; GFX9-NEXT: v_mul_lo_u32 v8, v18, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v20, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v5, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s19, v7 -; GFX9-NEXT: v_add_u32_e32 v1, v8, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, s18, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, s19, v7 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v19, v9 +; GFX9-NEXT: v_mul_hi_u32 v5, v19, v5 +; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v8, v18, v9 +; GFX9-NEXT: v_mul_hi_u32 v9, v19, v9 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v1 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s6, v11, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 -; GFX9-NEXT: v_add3_u32 v12, v8, v1, v7 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[5:6] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[8:9] -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s18, v4 -; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v0, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, s19, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s6, v8 -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v7, v8, v7, v9 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v18, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v19, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, s19, v5 +; GFX9-NEXT: v_mul_lo_u32 v10, s18, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v6, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v2, s18, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, s19, v5 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, s19, v8 +; GFX9-NEXT: v_add_u32_e32 v2, v9, v2 +; GFX9-NEXT: v_mul_hi_u32 v9, s18, v8 +; GFX9-NEXT: v_mul_hi_u32 v8, s19, v8 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v5, v2 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s6, v12, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v9, v10, v9 +; GFX9-NEXT: v_add3_u32 v13, v9, v2, v8 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[2:3], s6, v13, v[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v11, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v12, v[9:10] +; GFX9-NEXT: v_mov_b32_e32 v2, s19 +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s18, v5 +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v1, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 +; GFX9-NEXT: v_sub_u32_e32 v1, s19, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 1, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s6, v9 -; GFX9-NEXT: v_subbrev_co_u32_e64 v0, s[0:1], 0, v0, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v1, v0, s[0:1] -; GFX9-NEXT: global_store_dwordx4 v13, v[2:5], s[12:13] -; GFX9-NEXT: global_store_dwordx4 v13, v[6:9], s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s6, v9 +; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v13, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v10 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v11 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s6, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 +; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v13, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, v1, s[0:1] +; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[12:13] +; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll index 4c62409a85c00..dfc190998579a 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll @@ -203,83 +203,83 @@ define void @ds_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(3) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:31] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_write_b32 a18, v31 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; CHECK-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v32, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v33, a1 +; CHECK-NEXT: ds_wrxchg_rtn_b32 v32, v32, v33 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_accvgpr_write_b32 a31, v18 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a30, v19 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a29, v20 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a28, v21 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a27, v22 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a26, v23 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a25, v24 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a24, v25 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a23, v26 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a22, v27 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a21, v28 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a20, v29 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a19, v30 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; CHECK-NEXT: v_accvgpr_read_b32 v18, a31 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v19, a30 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v20, a29 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v21, a28 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v22, a27 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v23, a26 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v24, a25 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v25, a24 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v26, a23 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v27, a22 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v28, a21 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v29, a20 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v30, a19 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a31, v50 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a30, v51 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a29, v52 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a28, v53 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a27, v54 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a26, v55 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a25, v56 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a24, v57 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a23, v58 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a22, v59 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a21, v60 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a20, v61 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a19, v62 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a0, v32 +; CHECK-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: v_accvgpr_read_b32 v50, a31 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v51, a30 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v52, a29 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v53, a28 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v54, a27 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v55, a26 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v56, a25 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v57, a24 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v58, a23 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v59, a22 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v60, a21 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v61, a20 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v62, a19 ; Reload Reuse ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a0 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_accvgpr_read_b32 v31, a18 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use v[0:31] ; CHECK-NEXT: ;;#ASMEND @@ -764,83 +764,83 @@ define void @ds_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(3) %ptr) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:31] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def a1 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_write_b32 a18, v31 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1 +; CHECK-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v32, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v33, a1 +; CHECK-NEXT: ds_xor_rtn_b32 v32, v32, v33 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_accvgpr_write_b32 a31, v18 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a30, v19 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a29, v20 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a28, v21 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a27, v22 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a26, v23 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a25, v24 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a24, v25 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a23, v26 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a22, v27 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a21, v28 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a20, v29 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a19, v30 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; CHECK-NEXT: v_accvgpr_read_b32 v18, a31 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v19, a30 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v20, a29 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v21, a28 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v22, a27 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v23, a26 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v24, a25 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v25, a24 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v26, a23 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v27, a22 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v28, a21 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v29, a20 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v30, a19 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a31, v50 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a30, v51 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a29, v52 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a28, v53 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a27, v54 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a26, v55 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a25, v56 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a24, v57 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a23, v58 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a22, v59 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a21, v60 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a20, v61 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a19, v62 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a0, v32 +; CHECK-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: v_accvgpr_read_b32 v50, a31 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v51, a30 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v52, a29 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v53, a28 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v54, a27 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v55, a26 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v56, a25 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v57, a24 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v58, a23 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v59, a22 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v60, a21 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v61, a20 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v62, a19 ; Reload Reuse ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a0 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_accvgpr_read_b32 v31, a18 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use v[0:31] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index 5cceb918b755e..5f2a8ca037222 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -358,88 +358,88 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a2 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v33, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v32, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v34, a2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: flat_atomic_swap v32, v[32:33], v34 offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v51 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v52 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v53 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v54 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v55 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v56 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v57 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v58 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v59 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v60 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v61 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v62 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v32 +; GFX90A-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: v_accvgpr_read_b32 v51, a31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v52, a30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v53, a29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v54, a28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v55, a27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v56, a26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v57, a25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v58, a24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v59, a23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v60, a22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v61, a21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v62, a20 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v63, a19 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use v[0:31] ; GFX90A-NEXT: ;;#ASMEND @@ -485,59 +485,59 @@ define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[32:35], s32 ; 16-byte Folded Spill ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[36:39], s32 offset:16 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[40:43], s32 offset:32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[44:47], s32 offset:48 ; 16-byte Folded Spill ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a20, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a21, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a22, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v33, a1 +; GFX950-NEXT: scratch_store_dwordx3 off, v[48:50], s32 offset:64 ; 12-byte Folded Spill +; GFX950-NEXT: v_accvgpr_read_b32 v32, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v34, a2 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap v32, v[32:33], v34 offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a31, v51 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a27, v55 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a28, v54 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a29, v53 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a30, v52 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a23, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a24, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a25, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a26, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v32 +; GFX950-NEXT: scratch_load_dwordx4 v[32:35], off, s32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[36:39], off, s32 offset:16 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[40:43], off, s32 offset:32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[44:47], off, s32 offset:48 ; 16-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v51, a31 ; Reload Reuse +; GFX950-NEXT: scratch_load_dwordx3 v[48:50], off, s32 offset:64 ; 12-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v55, a27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v54, a28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v53, a29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v52, a30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a26 ; Reload Reuse ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v63, a19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a22 ; Reload Reuse ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use v[0:31] ; GFX950-NEXT: ;;#ASMEND @@ -2071,25 +2071,24 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_av_no_agprs: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: v_accvgpr_write_b32 a33, v1 ; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 ; GFX90A-NEXT: ;;#ASMSTART @@ -2127,20 +2126,19 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33 -; GFX90A-NEXT: flat_load_dword v1, v[2:3] +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a33 +; GFX90A-NEXT: flat_load_dword v1, v[4:5] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a34 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v0, v1, v4 +; GFX90A-NEXT: v_xor_b32_e32 v0, v1, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -2190,58 +2188,53 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a32 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:68 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:64 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:60 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:56 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:52 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:48 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:44 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:40 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:36 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:28 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:24 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:20 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:16 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:12 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:8 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:4 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a33, s32 ; 4-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a33, v1 ; GFX950-NEXT: v_accvgpr_write_b32 a32, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def a34 -; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 @@ -2275,16 +2268,18 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX950-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX950-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a32 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a33 -; GFX950-NEXT: flat_load_dword v1, v[2:3] -; GFX950-NEXT: v_accvgpr_read_b32 v4, a34 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a33 +; GFX950-NEXT: flat_load_dword v1, v[4:5] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v0, v1, v4 +; GFX950-NEXT: v_xor_b32_e32 v0, v1, v2 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -2333,25 +2328,24 @@ define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a32 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a33, off, s32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:4 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:8 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:12 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:16 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:20 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:24 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:28 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:36 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:40 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:44 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:52 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:60 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:68 ; 4-byte Folded Reload ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 @@ -4031,86 +4025,86 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a2 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v33, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v32, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v34, a2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GFX90A-NEXT: flat_atomic_xor v32, v[32:33], v34 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v51 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v52 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v53 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v54 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v55 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v56 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v57 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v58 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v59 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v60 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v61 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v62 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v32 +; GFX90A-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: v_accvgpr_read_b32 v51, a31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v52, a30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v53, a29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v54, a28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v55, a27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v56, a26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v57, a25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v58, a24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v59, a23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v60, a22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v61, a21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v62, a20 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v63, a19 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use v[0:31] ; GFX90A-NEXT: ;;#ASMEND @@ -4156,59 +4150,59 @@ define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[32:35], s32 ; 16-byte Folded Spill ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[36:39], s32 offset:16 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[40:43], s32 offset:32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[44:47], s32 offset:48 ; 16-byte Folded Spill ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a20, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a21, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a22, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v33, a1 +; GFX950-NEXT: scratch_store_dwordx3 off, v[48:50], s32 offset:64 ; 12-byte Folded Spill +; GFX950-NEXT: v_accvgpr_read_b32 v32, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v34, a2 ; GFX950-NEXT: buffer_wbl2 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-NEXT: flat_atomic_xor v32, v[32:33], v34 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: buffer_inv sc1 -; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a31, v51 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a27, v55 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a28, v54 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a29, v53 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a30, v52 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a23, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a24, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a25, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a26, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v32 +; GFX950-NEXT: scratch_load_dwordx4 v[32:35], off, s32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[36:39], off, s32 offset:16 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[40:43], off, s32 offset:32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[44:47], off, s32 offset:48 ; 16-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v51, a31 ; Reload Reuse +; GFX950-NEXT: scratch_load_dwordx3 v[48:50], off, s32 offset:64 ; 12-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v55, a27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v54, a28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v53, a29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v52, a30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a26 ; Reload Reuse ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v63, a19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a22 ; Reload Reuse ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use v[0:31] ; GFX950-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index b6fe0c756a106..a9859b97ba623 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -358,88 +358,88 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a2 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v33, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v32, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v34, a2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: global_atomic_swap v32, v[32:33], v34, off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v51 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v52 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v53 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v54 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v55 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v56 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v57 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v58 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v59 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v60 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v61 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v62 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v32 +; GFX90A-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: v_accvgpr_read_b32 v51, a31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v52, a30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v53, a29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v54, a28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v55, a27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v56, a26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v57, a25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v58, a24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v59, a23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v60, a22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v61, a21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v62, a20 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v63, a19 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use v[0:31] ; GFX90A-NEXT: ;;#ASMEND @@ -485,59 +485,59 @@ define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[32:35], s32 ; 16-byte Folded Spill ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[36:39], s32 offset:16 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[40:43], s32 offset:32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[44:47], s32 offset:48 ; 16-byte Folded Spill ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a20, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a21, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a22, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v33, a1 +; GFX950-NEXT: scratch_store_dwordx3 off, v[48:50], s32 offset:64 ; 12-byte Folded Spill +; GFX950-NEXT: v_accvgpr_read_b32 v32, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v34, a2 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 +; GFX950-NEXT: global_atomic_swap v32, v[32:33], v34, off offset:40 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 -; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a31, v51 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a27, v55 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a28, v54 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a29, v53 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a30, v52 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a23, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a24, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a25, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a26, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v32 +; GFX950-NEXT: scratch_load_dwordx4 v[32:35], off, s32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[36:39], off, s32 offset:16 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[40:43], off, s32 offset:32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[44:47], off, s32 offset:48 ; 16-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v51, a31 ; Reload Reuse +; GFX950-NEXT: scratch_load_dwordx3 v[48:50], off, s32 offset:64 ; 12-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v55, a27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v54, a28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v53, a29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v52, a30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a26 ; Reload Reuse ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v63, a19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a22 ; Reload Reuse ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use v[0:31] ; GFX950-NEXT: ;;#ASMEND @@ -1576,25 +1576,24 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_av_av_no_agprs: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a34, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: v_accvgpr_write_b32 a33, v1 ; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 ; GFX90A-NEXT: ;;#ASMSTART @@ -1632,20 +1631,19 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a32 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a33 -; GFX90A-NEXT: global_load_dword v1, v[2:3], off +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a33 +; GFX90A-NEXT: global_load_dword v1, v[4:5], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a34 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a34 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_xor_b32_e32 v0, v1, v4 +; GFX90A-NEXT: v_xor_b32_e32 v0, v1, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1], off glc +; GFX90A-NEXT: global_atomic_cmpswap v0, v[4:5], v[0:1], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1695,58 +1693,53 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a32 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_load_dword a34, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_av_av_no_agprs: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:72 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:68 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:64 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:60 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:56 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:52 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:48 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:44 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:40 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:36 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:32 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:28 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:24 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:20 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:16 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:12 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:8 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a33, s32 offset:4 ; 4-byte Folded Spill -; GFX950-NEXT: scratch_store_dword off, a34, s32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:68 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:64 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:60 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:56 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:52 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:48 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:44 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:40 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:36 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:28 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:24 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:20 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:16 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:12 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:8 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:4 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a33, s32 ; 4-byte Folded Spill ; GFX950-NEXT: v_accvgpr_write_b32 a33, v1 ; GFX950-NEXT: v_accvgpr_write_b32 a32, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def a34 -; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 @@ -1780,16 +1773,18 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX950-NEXT: v_accvgpr_write_b32 a29, v29 ; GFX950-NEXT: v_accvgpr_write_b32 a30, v30 ; GFX950-NEXT: v_accvgpr_write_b32 a31, v31 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a32 -; GFX950-NEXT: v_accvgpr_read_b32 v3, a33 -; GFX950-NEXT: global_load_dword v1, v[2:3], off -; GFX950-NEXT: v_accvgpr_read_b32 v4, a34 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a33 +; GFX950-NEXT: global_load_dword v1, v[4:5], off +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_xor_b32_e32 v0, v1, v4 +; GFX950-NEXT: v_xor_b32_e32 v0, v1, v2 ; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1], off sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v0, v[4:5], v[0:1], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc0 sc1 ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -1838,25 +1833,24 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a32 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_load_dword a34, off, s32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a33, off, s32 offset:4 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:8 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:12 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:16 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:20 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:24 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:28 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:32 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:36 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:40 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:44 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:48 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:52 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:56 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:60 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:64 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:68 ; 4-byte Folded Reload -; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:72 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a33, off, s32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:4 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:8 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:12 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:16 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:20 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:24 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:28 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:36 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:40 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:44 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:52 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:60 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:68 ; 4-byte Folded Reload ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -3012,86 +3006,86 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:31] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a2 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v33, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v32, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v34, a2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX90A-NEXT: global_atomic_xor v32, v[32:33], v34, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v51 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v52 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v53 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v54 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v55 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v56 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v57 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v58 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v59 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v60 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v61 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v62 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v32 +; GFX90A-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: v_accvgpr_read_b32 v51, a31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v52, a30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v53, a29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v54, a28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v55, a27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v56, a26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v57, a25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v58, a24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v59, a23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v60, a22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v61, a21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v62, a20 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v63, a19 ; Reload Reuse ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use v[0:31] ; GFX90A-NEXT: ;;#ASMEND @@ -3137,59 +3131,59 @@ define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v[0:31] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[32:35], s32 ; 16-byte Folded Spill ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill -; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[36:39], s32 offset:16 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[40:43], s32 offset:32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[44:47], s32 offset:48 ; 16-byte Folded Spill ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a2 ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill -; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 -; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a20, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a21, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a22, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v33, a1 +; GFX950-NEXT: scratch_store_dwordx3 off, v[48:50], s32 offset:64 ; 12-byte Folded Spill +; GFX950-NEXT: v_accvgpr_read_b32 v32, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v34, a2 ; GFX950-NEXT: buffer_wbl2 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 +; GFX950-NEXT: global_atomic_xor v32, v[32:33], v34, off sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: buffer_inv sc1 -; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload -; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload -; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a31, v51 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a27, v55 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a28, v54 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a29, v53 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a30, v52 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a23, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a24, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a25, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a26, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v32 +; GFX950-NEXT: scratch_load_dwordx4 v[32:35], off, s32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[36:39], off, s32 offset:16 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[40:43], off, s32 offset:32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[44:47], off, s32 offset:48 ; 16-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v51, a31 ; Reload Reuse +; GFX950-NEXT: scratch_load_dwordx3 v[48:50], off, s32 offset:64 ; 12-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v55, a27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v54, a28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v53, a29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v52, a30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a26 ; Reload Reuse ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v63, a19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a22 ; Reload Reuse ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use v[0:31] ; GFX950-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index b8814b64735e6..13ce63115d2c0 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -474,44 +474,44 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s6, s11, s39 -; GFX6-NEXT: s_add_i32 s7, s10, s38 -; GFX6-NEXT: s_add_i32 s10, s15, s43 -; GFX6-NEXT: s_add_i32 s11, s14, s42 -; GFX6-NEXT: s_add_i32 s14, s19, s47 -; GFX6-NEXT: s_add_i32 s15, s18, s46 -; GFX6-NEXT: s_add_i32 s18, s23, s51 -; GFX6-NEXT: s_add_i32 s19, s22, s50 -; GFX6-NEXT: s_add_i32 s21, s21, s49 -; GFX6-NEXT: s_add_i32 s20, s20, s48 -; GFX6-NEXT: s_add_i32 s17, s17, s45 -; GFX6-NEXT: s_add_i32 s16, s16, s44 -; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NEXT: v_mov_b32_e32 v2, s19 -; GFX6-NEXT: v_mov_b32_e32 v3, s18 -; GFX6-NEXT: s_add_i32 s13, s13, s41 -; GFX6-NEXT: s_add_i32 s12, s12, s40 +; GFX6-NEXT: s_add_i32 s4, s11, s39 +; GFX6-NEXT: s_add_i32 s5, s10, s38 +; GFX6-NEXT: s_add_i32 s6, s9, s37 +; GFX6-NEXT: s_add_i32 s7, s8, s36 +; GFX6-NEXT: s_add_i32 s8, s15, s43 +; GFX6-NEXT: s_add_i32 s9, s14, s42 +; GFX6-NEXT: s_add_i32 s10, s13, s41 +; GFX6-NEXT: s_add_i32 s11, s12, s40 +; GFX6-NEXT: s_add_i32 s12, s19, s47 +; GFX6-NEXT: s_add_i32 s13, s18, s46 +; GFX6-NEXT: s_add_i32 s14, s17, s45 +; GFX6-NEXT: s_add_i32 s15, s16, s44 +; GFX6-NEXT: s_add_i32 s16, s23, s51 +; GFX6-NEXT: s_add_i32 s17, s22, s50 +; GFX6-NEXT: s_add_i32 s18, s21, s49 +; GFX6-NEXT: s_add_i32 s19, s20, s48 +; GFX6-NEXT: v_mov_b32_e32 v0, s19 +; GFX6-NEXT: v_mov_b32_e32 v1, s18 +; GFX6-NEXT: v_mov_b32_e32 v2, s17 +; GFX6-NEXT: v_mov_b32_e32 v3, s16 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NEXT: v_mov_b32_e32 v2, s15 -; GFX6-NEXT: v_mov_b32_e32 v3, s14 -; GFX6-NEXT: s_add_i32 s9, s9, s37 -; GFX6-NEXT: s_add_i32 s8, s8, s36 +; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mov_b32_e32 v1, s14 +; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v3, s12 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 4df82946343b5..e0eaab5753b18 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -1,5 +1,5 @@ -; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri -mattr=-promote-alloca < %s | llc | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s -; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 -mattr=-promote-alloca < %s | llc | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s +; RUN: opt -passes=amdgpu-attributor -mcpu=kaveri -mattr=-promote-alloca < %s | llc -amdgpu-use-amdgpu-trackers=0 | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s +; RUN: opt -passes=amdgpu-attributor -mcpu=gfx900 -mattr=-promote-alloca < %s | llc -amdgpu-use-amdgpu-trackers=0 | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s target triple = "amdgcn-amd-amdhsa" diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index d965a3dbcc8a4..2a5097362ba6b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -2867,6 +2867,7 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 @@ -2877,7 +2878,6 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2931,6 +2931,7 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 @@ -2941,7 +2942,6 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2995,6 +2995,7 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 @@ -3005,7 +3006,6 @@ define inreg <32 x i32> @bitcast_v16f64_to_v32i32_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 @@ -4318,107 +4318,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -4435,258 +4334,362 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB12_2 ; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v8 ; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v7 ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v5 ; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v2 ; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v1 ; VI-NEXT: .LBB12_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB12_4 @@ -4749,273 +4752,272 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v32 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v8 ; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v7 ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v5 ; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v2 ; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v1 ; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v48 -; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v57 -; VI-NEXT: v_or_b32_sdwa v2, v2, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v41 -; VI-NEXT: v_or_b32_sdwa v48, v53, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v41 +; VI-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v38, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v47 +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 -; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -5026,23 +5028,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -5053,23 +5054,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -5080,23 +5080,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -5107,10 +5106,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -5120,9 +5119,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -5133,10 +5132,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -5147,9 +5146,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -5160,10 +5159,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -5174,9 +5173,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -5187,21 +5186,23 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -5212,10 +5213,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -5226,9 +5227,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -5239,10 +5240,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -5253,13 +5254,15 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -5304,9 +5307,69 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -5317,6 +5380,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -5327,6 +5394,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -5337,6 +5408,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -5347,6 +5422,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -5357,6 +5436,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -5367,6 +5450,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -5377,6 +5464,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -5387,316 +5478,230 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(45) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(47) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(48) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v1 ; GFX9-NEXT: .LBB12_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 -; GFX9-NEXT: s_waitcnt vmcnt(44) +; GFX9-NEXT: s_waitcnt vmcnt(46) ; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 @@ -5777,342 +5782,334 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v1 ; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v52 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v36 -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v50 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v57 -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v42 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v38 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v47 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v46 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v55 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v41 -; GFX9-NEXT: v_or_b32_sdwa v33, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v61, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -6122,11 +6119,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -6135,10 +6132,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -6148,11 +6145,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -6161,10 +6158,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -6174,11 +6171,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -6187,10 +6184,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -6200,11 +6197,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -6213,10 +6210,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -6226,11 +6223,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -6239,10 +6236,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -6252,11 +6249,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -6265,17 +6262,15 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -6553,17 +6548,17 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l @@ -6757,28 +6752,27 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 @@ -6864,6 +6858,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 @@ -6872,7 +6867,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 @@ -6881,18 +6875,19 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 @@ -6974,10 +6969,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: .LBB12_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_4 @@ -7111,56 +7105,52 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: .LBB12_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v66 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v67, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v61 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v67, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v58 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v57 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v47 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v46 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v45 @@ -7169,22 +7159,26 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v64 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v65, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v44 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v64 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v42 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v41 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v40 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 @@ -7414,27 +7408,26 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:84 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -7487,28 +7480,28 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v21, s68, 20 ; SI-NEXT: v_mov_b32_e32 v20, s16 ; SI-NEXT: v_writelane_b32 v21, s69, 21 -; SI-NEXT: v_readfirstlane_b32 s56, v20 +; SI-NEXT: v_readfirstlane_b32 s88, v20 ; SI-NEXT: v_mov_b32_e32 v20, s17 ; SI-NEXT: v_writelane_b32 v21, s70, 22 -; SI-NEXT: v_readfirstlane_b32 s57, v20 +; SI-NEXT: v_readfirstlane_b32 s89, v20 ; SI-NEXT: v_mov_b32_e32 v20, s18 ; SI-NEXT: v_writelane_b32 v21, s71, 23 -; SI-NEXT: v_readfirstlane_b32 s46, v20 +; SI-NEXT: v_readfirstlane_b32 s74, v20 ; SI-NEXT: v_mov_b32_e32 v20, s19 ; SI-NEXT: v_writelane_b32 v21, s80, 24 -; SI-NEXT: v_readfirstlane_b32 s47, v20 +; SI-NEXT: v_readfirstlane_b32 s75, v20 ; SI-NEXT: v_mov_b32_e32 v20, s20 ; SI-NEXT: v_writelane_b32 v21, s81, 25 -; SI-NEXT: v_readfirstlane_b32 s44, v20 +; SI-NEXT: v_readfirstlane_b32 s60, v20 ; SI-NEXT: v_mov_b32_e32 v20, s21 ; SI-NEXT: v_writelane_b32 v21, s82, 26 -; SI-NEXT: v_readfirstlane_b32 s45, v20 +; SI-NEXT: v_readfirstlane_b32 s61, v20 ; SI-NEXT: v_mov_b32_e32 v20, s22 ; SI-NEXT: v_writelane_b32 v21, s83, 27 -; SI-NEXT: v_readfirstlane_b32 s42, v20 +; SI-NEXT: v_readfirstlane_b32 s44, v20 ; SI-NEXT: v_mov_b32_e32 v20, s23 ; SI-NEXT: v_writelane_b32 v21, s84, 28 -; SI-NEXT: v_readfirstlane_b32 s43, v20 +; SI-NEXT: v_readfirstlane_b32 s45, v20 ; SI-NEXT: v_mov_b32_e32 v20, s24 ; SI-NEXT: v_writelane_b32 v21, s85, 29 ; SI-NEXT: v_readfirstlane_b32 s40, v20 @@ -7552,642 +7545,644 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s26, s5, 24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v23, s26, 42 -; SI-NEXT: s_lshr_b32 s26, s5, 16 -; SI-NEXT: v_writelane_b32 v23, s26, 43 -; SI-NEXT: s_lshr_b32 s26, s5, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 44 -; SI-NEXT: s_lshr_b32 s26, s7, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 45 -; SI-NEXT: s_lshr_b32 s26, s7, 16 -; SI-NEXT: v_writelane_b32 v23, s26, 46 -; SI-NEXT: s_lshr_b32 s26, s7, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 47 -; SI-NEXT: s_lshr_b32 s26, s9, 16 -; SI-NEXT: v_writelane_b32 v23, s26, 48 -; SI-NEXT: s_lshr_b32 s26, s9, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 49 -; SI-NEXT: s_lshr_b32 s26, s11, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 50 -; SI-NEXT: s_lshr_b32 s26, s11, 16 -; SI-NEXT: v_writelane_b32 v23, s26, 51 -; SI-NEXT: s_lshr_b32 s26, s11, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 52 -; SI-NEXT: s_lshr_b32 s26, s13, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 53 -; SI-NEXT: s_lshr_b32 s26, s13, 16 -; SI-NEXT: v_writelane_b32 v23, s26, 54 -; SI-NEXT: s_lshr_b32 s26, s13, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 55 -; SI-NEXT: s_lshr_b32 s26, s15, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 56 -; SI-NEXT: s_lshr_b32 s26, s15, 16 -; SI-NEXT: v_writelane_b32 v23, s26, 57 -; SI-NEXT: s_lshr_b32 s26, s15, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 58 -; SI-NEXT: s_lshr_b32 s26, s17, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 59 -; SI-NEXT: s_lshr_b32 s26, s17, 16 -; SI-NEXT: v_writelane_b32 v23, s26, 60 -; SI-NEXT: s_lshr_b32 s26, s17, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 61 -; SI-NEXT: s_lshr_b32 s26, s19, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 62 -; SI-NEXT: s_lshr_b32 s26, s19, 16 -; SI-NEXT: v_writelane_b32 v23, s26, 63 -; SI-NEXT: s_lshr_b32 s26, s19, 8 -; SI-NEXT: v_writelane_b32 v22, s26, 0 -; SI-NEXT: s_lshr_b32 s26, s21, 24 -; SI-NEXT: v_writelane_b32 v22, s26, 1 -; SI-NEXT: s_lshr_b32 s26, s21, 16 -; SI-NEXT: v_writelane_b32 v22, s26, 2 -; SI-NEXT: s_lshr_b32 s26, s21, 8 -; SI-NEXT: v_writelane_b32 v22, s26, 3 -; SI-NEXT: s_lshr_b32 s26, s23, 24 -; SI-NEXT: v_writelane_b32 v22, s26, 4 ; SI-NEXT: s_lshr_b32 s26, s23, 16 -; SI-NEXT: v_writelane_b32 v22, s26, 5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v22, s26, 0 ; SI-NEXT: s_lshr_b32 s26, s23, 8 -; SI-NEXT: v_writelane_b32 v22, s26, 6 +; SI-NEXT: v_writelane_b32 v22, s26, 1 ; SI-NEXT: s_lshr_b32 s26, s25, 24 -; SI-NEXT: v_writelane_b32 v22, s26, 7 +; SI-NEXT: v_writelane_b32 v22, s26, 2 ; SI-NEXT: s_lshr_b32 s26, s25, 16 -; SI-NEXT: v_writelane_b32 v22, s26, 8 +; SI-NEXT: v_writelane_b32 v22, s26, 3 ; SI-NEXT: s_lshr_b32 s26, s25, 8 -; SI-NEXT: v_writelane_b32 v22, s26, 9 -; SI-NEXT: s_lshr_b32 s26, s41, 24 -; SI-NEXT: v_writelane_b32 v22, s26, 10 -; SI-NEXT: s_lshr_b32 s26, s41, 16 -; SI-NEXT: v_writelane_b32 v22, s26, 11 -; SI-NEXT: s_lshr_b32 s26, s41, 8 -; SI-NEXT: v_writelane_b32 v22, s26, 12 -; SI-NEXT: s_lshr_b32 s26, s43, 24 -; SI-NEXT: v_writelane_b32 v22, s26, 13 -; SI-NEXT: s_lshr_b32 s26, s43, 16 -; SI-NEXT: v_writelane_b32 v22, s26, 14 -; SI-NEXT: s_lshr_b32 s26, s43, 8 -; SI-NEXT: v_writelane_b32 v22, s26, 15 -; SI-NEXT: s_lshr_b32 s26, s45, 24 -; SI-NEXT: v_writelane_b32 v22, s26, 16 -; SI-NEXT: s_lshr_b32 s26, s45, 16 -; SI-NEXT: v_writelane_b32 v22, s26, 17 +; SI-NEXT: v_writelane_b32 v22, s26, 4 ; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v23, s26, 62 +; SI-NEXT: v_writelane_b32 v23, s27, 63 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 60 +; SI-NEXT: v_writelane_b32 v23, s27, 61 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 58 +; SI-NEXT: v_writelane_b32 v23, s27, 59 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 56 +; SI-NEXT: v_writelane_b32 v23, s27, 57 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 54 +; SI-NEXT: v_writelane_b32 v23, s27, 55 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 52 +; SI-NEXT: v_writelane_b32 v23, s27, 53 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 50 +; SI-NEXT: v_writelane_b32 v23, s27, 51 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 48 +; SI-NEXT: v_writelane_b32 v23, s27, 49 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v23, s26, 46 +; SI-NEXT: v_writelane_b32 v23, s27, 47 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 44 +; SI-NEXT: v_writelane_b32 v23, s27, 45 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 42 +; SI-NEXT: v_writelane_b32 v23, s27, 43 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 40 ; SI-NEXT: v_writelane_b32 v23, s27, 41 -; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 38 ; SI-NEXT: v_writelane_b32 v23, s27, 39 -; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 36 ; SI-NEXT: v_writelane_b32 v23, s27, 37 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 34 ; SI-NEXT: v_writelane_b32 v23, s27, 35 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 32 ; SI-NEXT: v_writelane_b32 v23, s27, 33 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 30 ; SI-NEXT: v_writelane_b32 v23, s27, 31 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 28 ; SI-NEXT: v_writelane_b32 v23, s27, 29 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 26 ; SI-NEXT: v_writelane_b32 v23, s27, 27 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 24 ; SI-NEXT: v_writelane_b32 v23, s27, 25 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 22 ; SI-NEXT: v_writelane_b32 v23, s27, 23 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 20 ; SI-NEXT: v_writelane_b32 v23, s27, 21 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 18 ; SI-NEXT: v_writelane_b32 v23, s27, 19 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 16 ; SI-NEXT: v_writelane_b32 v23, s27, 17 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 14 ; SI-NEXT: v_writelane_b32 v23, s27, 15 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 12 ; SI-NEXT: v_writelane_b32 v23, s27, 13 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 10 +; SI-NEXT: s_lshr_b32 s56, s5, 16 ; SI-NEXT: v_writelane_b32 v23, s27, 11 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 +; SI-NEXT: s_lshr_b32 s58, s5, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 8 +; SI-NEXT: s_mov_b32 s47, s56 +; SI-NEXT: s_lshr_b64 s[56:57], s[40:41], 8 +; SI-NEXT: s_lshr_b32 s62, s7, 24 ; SI-NEXT: v_writelane_b32 v23, s27, 9 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[22:23], 16 +; SI-NEXT: s_mov_b32 s57, s58 +; SI-NEXT: s_lshr_b64 s[58:59], s[44:45], 24 +; SI-NEXT: s_lshr_b32 s72, s7, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 6 +; SI-NEXT: s_mov_b32 s59, s62 +; SI-NEXT: s_lshr_b64 s[62:63], s[44:45], 16 +; SI-NEXT: s_lshr_b32 s76, s7, 8 ; SI-NEXT: v_writelane_b32 v23, s27, 7 -; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[22:23], 8 +; SI-NEXT: s_mov_b32 s63, s72 +; SI-NEXT: s_lshr_b64 s[72:73], s[44:45], 8 +; SI-NEXT: s_lshr_b32 s78, s9, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 4 +; SI-NEXT: s_mov_b32 s73, s76 +; SI-NEXT: s_lshr_b64 s[76:77], s[60:61], 24 +; SI-NEXT: s_lshr_b32 s90, s9, 16 ; SI-NEXT: v_writelane_b32 v23, s27, 5 -; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 24 +; SI-NEXT: s_mov_b32 s77, s78 +; SI-NEXT: s_lshr_b64 s[78:79], s[60:61], 16 +; SI-NEXT: s_lshr_b32 s92, s9, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 2 +; SI-NEXT: s_mov_b32 s79, s90 +; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 8 +; SI-NEXT: s_lshr_b32 s94, s11, 24 +; SI-NEXT: s_lshr_b32 s36, s13, 24 ; SI-NEXT: v_writelane_b32 v23, s27, 3 -; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 8 -; SI-NEXT: v_writelane_b32 v23, s26, 0 -; SI-NEXT: s_lshr_b32 s50, s9, 24 -; SI-NEXT: s_lshr_b32 s51, s45, 8 -; SI-NEXT: s_lshr_b32 s48, s47, 24 -; SI-NEXT: s_lshr_b32 s52, s47, 16 -; SI-NEXT: s_lshr_b32 s53, s47, 8 -; SI-NEXT: s_lshr_b32 s54, s57, 24 -; SI-NEXT: s_lshr_b32 s55, s57, 16 -; SI-NEXT: s_lshr_b32 s49, s57, 8 -; SI-NEXT: v_writelane_b32 v23, s27, 1 -; SI-NEXT: s_lshr_b64 s[64:65], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[66:67], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[84:85], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[86:87], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[98:99], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 8 +; SI-NEXT: s_mov_b32 s91, s92 +; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[88:89], 24 +; SI-NEXT: s_lshr_b32 vcc_lo, s11, 16 +; SI-NEXT: s_lshr_b32 vcc_hi, s11, 8 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s82, s15, 24 +; SI-NEXT: s_lshr_b32 s84, s15, 8 +; SI-NEXT: s_lshr_b32 s86, s17, 16 +; SI-NEXT: s_lshr_b32 s96, s19, 24 +; SI-NEXT: s_lshr_b32 s98, s21, 24 +; SI-NEXT: s_lshr_b32 s28, s23, 24 +; SI-NEXT: v_writelane_b32 v23, s42, 0 +; SI-NEXT: s_mov_b32 s93, s94 +; SI-NEXT: s_lshr_b64 s[94:95], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[74:75], 8 +; SI-NEXT: s_mov_b32 s35, s36 +; SI-NEXT: s_lshr_b64 s[36:37], s[88:89], 16 +; SI-NEXT: s_lshr_b32 s51, s5, 24 +; SI-NEXT: s_lshr_b32 s39, s13, 8 +; SI-NEXT: s_lshr_b32 s83, s15, 16 +; SI-NEXT: s_lshr_b32 s85, s17, 24 +; SI-NEXT: s_lshr_b32 s87, s17, 8 +; SI-NEXT: s_lshr_b32 s97, s19, 16 +; SI-NEXT: s_lshr_b32 s65, s19, 8 +; SI-NEXT: s_lshr_b32 s99, s21, 16 +; SI-NEXT: s_lshr_b32 s64, s21, 8 +; SI-NEXT: s_lshr_b32 s81, s41, 24 +; SI-NEXT: s_lshr_b32 s55, s41, 16 +; SI-NEXT: s_lshr_b32 s50, s41, 8 +; SI-NEXT: s_lshr_b32 s48, s45, 24 +; SI-NEXT: s_lshr_b32 s66, s45, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 8 +; SI-NEXT: s_lshr_b32 s68, s61, 24 +; SI-NEXT: s_lshr_b32 s69, s61, 16 +; SI-NEXT: s_lshr_b32 s49, s61, 8 +; SI-NEXT: s_lshr_b32 s52, s75, 24 +; SI-NEXT: s_lshr_b32 s53, s75, 16 +; SI-NEXT: s_lshr_b32 s70, s75, 8 +; SI-NEXT: s_lshr_b32 s71, s89, 24 +; SI-NEXT: s_lshr_b32 s54, s89, 16 +; SI-NEXT: s_lshr_b32 s80, s89, 8 ; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[40:41], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[44:45], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[46:47], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[56:57], 8 +; SI-NEXT: v_writelane_b32 v23, s43, 1 +; SI-NEXT: s_lshr_b64 s[42:43], s[40:41], 24 +; SI-NEXT: s_mov_b32 s95, vcc_lo +; SI-NEXT: s_mov_b32 s31, vcc_hi +; SI-NEXT: s_mov_b32 s37, s38 +; SI-NEXT: s_mov_b32 s38, s82 +; SI-NEXT: s_mov_b32 s82, s84 +; SI-NEXT: s_mov_b32 s84, s86 +; SI-NEXT: s_mov_b32 s86, s96 +; SI-NEXT: s_mov_b32 s96, s98 +; SI-NEXT: s_mov_b32 s98, s28 +; SI-NEXT: s_lshr_b64 s[28:29], s[88:89], 8 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 ; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 -; SI-NEXT: v_writelane_b32 v23, s26, 40 -; SI-NEXT: v_writelane_b32 v23, s27, 41 +; SI-NEXT: v_writelane_b32 v23, s26, 62 +; SI-NEXT: v_writelane_b32 v23, s27, 63 ; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 -; SI-NEXT: v_writelane_b32 v23, s26, 38 -; SI-NEXT: v_writelane_b32 v23, s27, 39 -; SI-NEXT: s_lshr_b32 s26, s5, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 42 -; SI-NEXT: s_lshr_b32 s26, s5, 16 +; SI-NEXT: v_writelane_b32 v23, s26, 60 +; SI-NEXT: v_writelane_b32 v23, s27, 61 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 ; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_writelane_b32 v23, s26, 43 -; SI-NEXT: s_lshr_b32 s26, s5, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 44 -; SI-NEXT: s_lshr_b32 s26, s7, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 45 -; SI-NEXT: s_lshr_b32 s26, s7, 16 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_writelane_b32 v23, s26, 58 +; SI-NEXT: v_writelane_b32 v23, s27, 59 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 56 +; SI-NEXT: v_writelane_b32 v23, s27, 57 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 54 +; SI-NEXT: v_writelane_b32 v23, s27, 55 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: v_writelane_b32 v23, s26, 46 -; SI-NEXT: s_lshr_b32 s26, s7, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 47 -; SI-NEXT: s_lshr_b32 s26, s9, 16 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_writelane_b32 v23, s26, 48 -; SI-NEXT: s_lshr_b32 s26, s9, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 49 -; SI-NEXT: s_lshr_b32 s26, s11, 24 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: v_writelane_b32 v23, s26, 52 +; SI-NEXT: v_writelane_b32 v23, s27, 53 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 50 -; SI-NEXT: s_lshr_b32 s26, s11, 16 +; SI-NEXT: v_writelane_b32 v23, s27, 51 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 48 +; SI-NEXT: v_writelane_b32 v23, s27, 49 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: v_writelane_b32 v23, s26, 46 +; SI-NEXT: v_writelane_b32 v23, s27, 47 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 44 +; SI-NEXT: v_writelane_b32 v23, s27, 45 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 42 +; SI-NEXT: v_writelane_b32 v23, s27, 43 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_writelane_b32 v23, s26, 51 -; SI-NEXT: s_lshr_b32 s26, s11, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 52 -; SI-NEXT: s_lshr_b32 s26, s13, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 53 -; SI-NEXT: s_lshr_b32 s26, s13, 16 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: v_writelane_b32 v23, s26, 54 -; SI-NEXT: s_lshr_b32 s26, s13, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 55 -; SI-NEXT: s_lshr_b32 s26, s15, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 56 -; SI-NEXT: s_lshr_b32 s26, s15, 16 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_writelane_b32 v23, s26, 57 -; SI-NEXT: s_lshr_b32 s26, s15, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 58 -; SI-NEXT: s_lshr_b32 s26, s17, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 59 -; SI-NEXT: s_lshr_b32 s26, s17, 16 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_writelane_b32 v23, s26, 60 -; SI-NEXT: s_lshr_b32 s26, s17, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 61 -; SI-NEXT: s_lshr_b32 s26, s19, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 62 -; SI-NEXT: s_lshr_b32 s26, s19, 16 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_writelane_b32 v23, s26, 63 -; SI-NEXT: s_lshr_b32 s26, s19, 8 -; SI-NEXT: v_writelane_b32 v22, s26, 0 -; SI-NEXT: s_lshr_b32 s26, s21, 24 -; SI-NEXT: v_writelane_b32 v22, s26, 1 -; SI-NEXT: s_lshr_b32 s26, s21, 16 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_writelane_b32 v22, s26, 2 -; SI-NEXT: s_lshr_b32 s26, s21, 8 -; SI-NEXT: v_writelane_b32 v22, s26, 3 -; SI-NEXT: s_lshr_b32 s26, s23, 24 -; SI-NEXT: v_writelane_b32 v22, s26, 4 -; SI-NEXT: s_lshr_b32 s26, s23, 16 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: v_writelane_b32 v22, s26, 5 -; SI-NEXT: s_lshr_b32 s26, s23, 8 -; SI-NEXT: v_writelane_b32 v22, s26, 6 -; SI-NEXT: s_lshr_b32 s26, s25, 24 -; SI-NEXT: v_writelane_b32 v22, s26, 7 -; SI-NEXT: s_lshr_b32 s26, s25, 16 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: v_writelane_b32 v22, s26, 8 -; SI-NEXT: s_lshr_b32 s26, s25, 8 -; SI-NEXT: v_writelane_b32 v22, s26, 9 -; SI-NEXT: s_lshr_b32 s26, s41, 24 -; SI-NEXT: v_writelane_b32 v22, s26, 10 -; SI-NEXT: s_lshr_b32 s26, s41, 16 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: v_writelane_b32 v22, s26, 11 -; SI-NEXT: s_lshr_b32 s26, s41, 8 -; SI-NEXT: v_writelane_b32 v22, s26, 12 -; SI-NEXT: s_lshr_b32 s26, s43, 24 -; SI-NEXT: v_writelane_b32 v22, s26, 13 -; SI-NEXT: s_lshr_b32 s26, s43, 16 -; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: v_writelane_b32 v22, s26, 14 -; SI-NEXT: s_lshr_b32 s26, s43, 8 -; SI-NEXT: v_writelane_b32 v22, s26, 15 -; SI-NEXT: s_lshr_b32 s26, s45, 24 -; SI-NEXT: v_writelane_b32 v22, s26, 16 -; SI-NEXT: s_lshr_b32 s26, s45, 16 -; SI-NEXT: v_writelane_b32 v22, s26, 17 -; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 -; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: v_writelane_b32 v23, s26, 40 +; SI-NEXT: v_writelane_b32 v23, s27, 41 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 38 +; SI-NEXT: v_writelane_b32 v23, s27, 39 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 36 ; SI-NEXT: v_writelane_b32 v23, s27, 37 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: v_writelane_b32 v23, s26, 34 ; SI-NEXT: v_writelane_b32 v23, s27, 35 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 32 ; SI-NEXT: v_writelane_b32 v23, s27, 33 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 -; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 30 ; SI-NEXT: v_writelane_b32 v23, s27, 31 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_writelane_b32 v23, s26, 28 ; SI-NEXT: v_writelane_b32 v23, s27, 29 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 26 ; SI-NEXT: v_writelane_b32 v23, s27, 27 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 -; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 24 ; SI-NEXT: v_writelane_b32 v23, s27, 25 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 8 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_writelane_b32 v23, s26, 22 ; SI-NEXT: v_writelane_b32 v23, s27, 23 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 20 ; SI-NEXT: v_writelane_b32 v23, s27, 21 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 -; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 18 ; SI-NEXT: v_writelane_b32 v23, s27, 19 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 8 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: v_writelane_b32 v23, s26, 16 ; SI-NEXT: v_writelane_b32 v23, s27, 17 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 14 ; SI-NEXT: v_writelane_b32 v23, s27, 15 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 -; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 12 ; SI-NEXT: v_writelane_b32 v23, s27, 13 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 8 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_writelane_b32 v23, s26, 10 ; SI-NEXT: v_writelane_b32 v23, s27, 11 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[22:23], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 8 ; SI-NEXT: v_writelane_b32 v23, s27, 9 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 -; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[22:23], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 6 ; SI-NEXT: v_writelane_b32 v23, s27, 7 -; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[22:23], 8 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_writelane_b32 v23, s26, 4 ; SI-NEXT: v_writelane_b32 v23, s27, 5 -; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 2 ; SI-NEXT: v_writelane_b32 v23, s27, 3 -; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 8 -; SI-NEXT: s_add_i32 s57, s57, 3 -; SI-NEXT: s_add_i32 s56, s56, 3 -; SI-NEXT: s_add_i32 s47, s47, 3 -; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: v_writelane_b32 v22, s27, 0 +; SI-NEXT: s_lshr_b32 s27, s23, 8 +; SI-NEXT: s_add_i32 s89, s89, 3 +; SI-NEXT: s_add_i32 s88, s88, 3 +; SI-NEXT: s_add_i32 s75, s75, 3 +; SI-NEXT: s_add_i32 s74, s74, 3 +; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_add_i32 s60, s60, 3 +; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_add_i32 s44, s44, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_writelane_b32 v23, s26, 0 -; SI-NEXT: s_lshr_b32 s50, s9, 24 -; SI-NEXT: s_lshr_b32 s51, s45, 8 -; SI-NEXT: s_lshr_b32 s48, s47, 24 -; SI-NEXT: s_lshr_b32 s52, s47, 16 -; SI-NEXT: s_lshr_b32 s53, s47, 8 -; SI-NEXT: s_lshr_b32 s54, s57, 24 -; SI-NEXT: s_lshr_b32 s55, s57, 16 -; SI-NEXT: s_lshr_b32 s49, s57, 8 -; SI-NEXT: v_writelane_b32 v23, s27, 1 -; SI-NEXT: s_lshr_b64 s[64:65], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[66:67], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[84:85], s[22:23], 24 -; SI-NEXT: s_lshr_b64 s[86:87], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[96:97], s[22:23], 8 -; SI-NEXT: s_lshr_b64 s[98:99], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 ; SI-NEXT: s_lshr_b64 s[28:29], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[40:41], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[44:45], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[46:47], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[56:57], 8 +; SI-NEXT: v_writelane_b32 v22, s27, 1 +; SI-NEXT: s_lshr_b32 s27, s25, 24 +; SI-NEXT: v_writelane_b32 v23, s28, 0 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[62:63], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[76:77], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[74:75], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[88:89], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[88:89], 16 +; SI-NEXT: v_writelane_b32 v22, s27, 2 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: v_writelane_b32 v23, s29, 1 +; SI-NEXT: s_lshr_b64 s[42:43], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[88:89], 8 +; SI-NEXT: s_lshr_b32 s51, s5, 24 +; SI-NEXT: s_lshr_b32 s47, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s5, 8 +; SI-NEXT: s_lshr_b32 s59, s7, 24 +; SI-NEXT: s_lshr_b32 s63, s7, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 8 +; SI-NEXT: s_lshr_b32 s77, s9, 24 +; SI-NEXT: s_lshr_b32 s79, s9, 16 +; SI-NEXT: s_lshr_b32 s91, s9, 8 +; SI-NEXT: s_lshr_b32 s93, s11, 24 +; SI-NEXT: s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s31, s11, 8 +; SI-NEXT: s_lshr_b32 s35, s13, 24 +; SI-NEXT: s_lshr_b32 s37, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s13, 8 +; SI-NEXT: s_lshr_b32 s38, s15, 24 +; SI-NEXT: s_lshr_b32 s83, s15, 16 +; SI-NEXT: s_lshr_b32 s82, s15, 8 +; SI-NEXT: s_lshr_b32 s85, s17, 24 +; SI-NEXT: s_lshr_b32 s84, s17, 16 +; SI-NEXT: s_lshr_b32 s87, s17, 8 +; SI-NEXT: s_lshr_b32 s86, s19, 24 +; SI-NEXT: s_lshr_b32 s97, s19, 16 +; SI-NEXT: s_lshr_b32 s65, s19, 8 +; SI-NEXT: s_lshr_b32 s96, s21, 24 +; SI-NEXT: s_lshr_b32 s99, s21, 16 +; SI-NEXT: s_lshr_b32 s64, s21, 8 +; SI-NEXT: s_lshr_b32 s98, s23, 24 +; SI-NEXT: v_writelane_b32 v22, s27, 3 +; SI-NEXT: s_lshr_b32 s27, s25, 8 +; SI-NEXT: s_lshr_b32 s81, s41, 24 +; SI-NEXT: s_lshr_b32 s55, s41, 16 +; SI-NEXT: s_lshr_b32 s50, s41, 8 +; SI-NEXT: s_lshr_b32 s48, s45, 24 +; SI-NEXT: s_lshr_b32 s66, s45, 16 +; SI-NEXT: s_lshr_b32 s67, s45, 8 +; SI-NEXT: s_lshr_b32 s68, s61, 24 +; SI-NEXT: s_lshr_b32 s69, s61, 16 +; SI-NEXT: s_lshr_b32 s49, s61, 8 +; SI-NEXT: s_lshr_b32 s52, s75, 24 +; SI-NEXT: s_lshr_b32 s53, s75, 16 +; SI-NEXT: s_lshr_b32 s70, s75, 8 +; SI-NEXT: s_lshr_b32 s71, s89, 24 +; SI-NEXT: s_lshr_b32 s54, s89, 16 +; SI-NEXT: s_lshr_b32 s80, s89, 8 +; SI-NEXT: v_writelane_b32 v22, s27, 4 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s38, 8 -; SI-NEXT: s_and_b32 s29, s56, 0xff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s88, 0xff +; SI-NEXT: s_lshl_b32 s29, s28, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: s_and_b32 s29, s36, 0xff -; SI-NEXT: s_lshl_b32 s56, s34, 24 ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_or_b32 s29, s56, s29 +; SI-NEXT: s_lshl_b32 s43, s34, 24 +; SI-NEXT: s_or_b32 s29, s43, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v1, s27 -; SI-NEXT: s_and_b32 s27, s57, 0xff -; SI-NEXT: s_lshl_b32 s29, s49, 8 +; SI-NEXT: s_and_b32 s27, s89, 0xff +; SI-NEXT: s_lshl_b32 s29, s80, 8 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: s_and_b32 s29, s55, 0xff +; SI-NEXT: s_and_b32 s29, s54, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_lshl_b32 s56, s54, 24 -; SI-NEXT: s_or_b32 s29, s56, s29 +; SI-NEXT: s_lshl_b32 s43, s71, 24 +; SI-NEXT: s_or_b32 s29, s43, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s30, 8 -; SI-NEXT: s_and_b32 s29, s46, 0xff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s74, 0xff +; SI-NEXT: s_lshl_b32 s29, s30, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: s_and_b32 s29, s94, 0xff -; SI-NEXT: s_lshl_b32 s46, s92, 24 ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_or_b32 s29, s46, s29 +; SI-NEXT: s_lshl_b32 s43, s92, 24 +; SI-NEXT: s_or_b32 s29, s43, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s47, 0xff -; SI-NEXT: s_lshl_b32 s29, s53, 8 +; SI-NEXT: s_and_b32 s27, s75, 0xff +; SI-NEXT: s_lshl_b32 s29, s70, 8 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: s_and_b32 s29, s52, 0xff +; SI-NEXT: s_and_b32 s29, s53, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_lshl_b32 s46, s48, 24 -; SI-NEXT: s_or_b32 s29, s46, s29 +; SI-NEXT: s_lshl_b32 s43, s52, 24 +; SI-NEXT: s_or_b32 s29, s43, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s90, 8 -; SI-NEXT: s_and_b32 s29, s44, 0xff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: s_and_b32 s29, s88, 0xff -; SI-NEXT: s_lshl_b32 s44, s78, 24 +; SI-NEXT: s_and_b32 s27, s60, 0xff +; SI-NEXT: s_lshl_b32 s29, s90, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: s_and_b32 s29, s78, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_or_b32 s29, s44, s29 +; SI-NEXT: s_lshl_b32 s43, s76, 24 +; SI-NEXT: s_or_b32 s29, s43, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v5, s27 -; SI-NEXT: s_and_b32 s27, s45, 0xff -; SI-NEXT: s_lshl_b32 s29, s51, 8 +; SI-NEXT: s_and_b32 s27, s61, 0xff +; SI-NEXT: s_lshl_b32 s29, s49, 8 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_readlane_b32 s29, v22, 17 -; SI-NEXT: s_and_b32 s29, s29, 0xff -; SI-NEXT: v_readlane_b32 s44, v22, 16 +; SI-NEXT: s_and_b32 s29, s69, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_lshl_b32 s44, s44, 24 -; SI-NEXT: s_or_b32 s29, s44, s29 +; SI-NEXT: s_lshl_b32 s43, s68, 24 +; SI-NEXT: s_or_b32 s29, s43, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v6, s27 -; SI-NEXT: s_lshl_b32 s27, s76, 8 -; SI-NEXT: s_and_b32 s29, s42, 0xff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: s_and_b32 s29, s74, 0xff -; SI-NEXT: s_lshl_b32 s42, s72, 24 +; SI-NEXT: s_and_b32 s27, s44, 0xff +; SI-NEXT: s_lshl_b32 s29, s72, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: s_and_b32 s29, s62, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_or_b32 s29, s42, s29 +; SI-NEXT: s_lshl_b32 s43, s58, 24 +; SI-NEXT: s_or_b32 s29, s43, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_readlane_b32 s29, v22, 15 ; SI-NEXT: v_mov_b32_e32 v7, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xff -; SI-NEXT: s_lshl_b32 s29, s29, 8 +; SI-NEXT: s_and_b32 s27, s45, 0xff +; SI-NEXT: s_lshl_b32 s29, s67, 8 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_readlane_b32 s29, v22, 14 -; SI-NEXT: s_and_b32 s29, s29, 0xff -; SI-NEXT: v_readlane_b32 s42, v22, 13 +; SI-NEXT: s_and_b32 s29, s66, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_lshl_b32 s42, s42, 24 -; SI-NEXT: s_or_b32 s29, s42, s29 +; SI-NEXT: s_lshl_b32 s43, s48, 24 +; SI-NEXT: s_or_b32 s29, s43, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v8, s27 -; SI-NEXT: s_lshl_b32 s27, s62, 8 -; SI-NEXT: s_and_b32 s29, s40, 0xff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: s_and_b32 s29, s60, 0xff -; SI-NEXT: s_lshl_b32 s40, s58, 24 +; SI-NEXT: s_and_b32 s27, s40, 0xff +; SI-NEXT: s_lshl_b32 s29, s56, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: s_and_b32 s29, s46, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s40, s42, 24 ; SI-NEXT: s_or_b32 s29, s40, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_readlane_b32 s29, v22, 12 ; SI-NEXT: v_mov_b32_e32 v9, s27 ; SI-NEXT: s_and_b32 s27, s41, 0xff -; SI-NEXT: s_lshl_b32 s29, s29, 8 +; SI-NEXT: s_lshl_b32 s29, s50, 8 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_readlane_b32 s29, v22, 11 -; SI-NEXT: s_and_b32 s29, s29, 0xff -; SI-NEXT: v_readlane_b32 s40, v22, 10 +; SI-NEXT: s_and_b32 s29, s55, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_lshl_b32 s40, s40, 24 +; SI-NEXT: s_lshl_b32 s40, s81, 24 ; SI-NEXT: s_or_b32 s29, s40, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: v_readlane_b32 s28, v23, 0 ; SI-NEXT: v_mov_b32_e32 v10, s27 -; SI-NEXT: s_lshl_b32 s27, s28, 8 ; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_lshl_b32 s27, s28, 8 ; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: v_readlane_b32 s28, v23, 2 ; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: s_lshl_b32 s27, s98, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_lshl_b32 s27, s28, 24 ; SI-NEXT: s_or_b32 s26, s27, s26 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s26 ; SI-NEXT: v_mov_b32_e32 v11, s24 ; SI-NEXT: s_and_b32 s24, s25, 0xff -; SI-NEXT: v_readlane_b32 s25, v22, 9 +; SI-NEXT: v_readlane_b32 s25, v22, 4 ; SI-NEXT: s_lshl_b32 s25, s25, 8 ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: v_readlane_b32 s25, v22, 8 +; SI-NEXT: v_readlane_b32 s25, v22, 3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: s_and_b32 s25, s25, 0xff -; SI-NEXT: v_readlane_b32 s26, v22, 7 +; SI-NEXT: v_readlane_b32 s26, v22, 2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: s_lshl_b32 s25, s25, 16 ; SI-NEXT: s_lshl_b32 s26, s26, 24 -; SI-NEXT: s_or_b32 s25, s26, s25 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s25, s26, s25 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: s_lshl_b32 s24, s96, 8 +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: v_readlane_b32 s24, v23, 4 +; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 ; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_lshl_b32 s24, s24, 8 +; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: v_readlane_b32 s25, v23, 5 ; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: s_and_b32 s24, s86, 0xff -; SI-NEXT: s_lshl_b32 s25, s84, 24 +; SI-NEXT: v_readlane_b32 s24, v23, 6 +; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: v_readlane_b32 s25, v23, 7 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: v_readlane_b32 s26, v23, 8 +; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 ; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: s_lshl_b32 s25, s26, 24 +; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 ; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s22 ; SI-NEXT: s_and_b32 s22, s23, 0xff -; SI-NEXT: v_readlane_b32 s23, v22, 6 +; SI-NEXT: v_readlane_b32 s23, v22, 1 ; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: v_readlane_b32 s23, v22, 5 +; SI-NEXT: v_readlane_b32 s23, v22, 0 ; SI-NEXT: s_and_b32 s23, s23, 0xff -; SI-NEXT: v_readlane_b32 s24, v22, 4 ; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: s_lshl_b32 s24, s24, 24 -; SI-NEXT: s_or_b32 s23, s24, s23 +; SI-NEXT: s_lshl_b32 s24, s98, 24 ; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s23, s24, s23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 ; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: s_lshl_b32 s22, s82, 8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: v_readlane_b32 s22, v23, 10 ; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 8 +; SI-NEXT: v_readlane_b32 s23, v23, 11 ; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: s_and_b32 s22, s80, 0xff -; SI-NEXT: s_lshl_b32 s23, s70, 24 +; SI-NEXT: v_readlane_b32 s22, v23, 12 +; SI-NEXT: v_readlane_b32 s23, v23, 13 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: v_readlane_b32 s24, v23, 14 ; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: s_lshl_b32 s23, s24, 24 ; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 ; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s20 ; SI-NEXT: s_and_b32 s20, s21, 0xff -; SI-NEXT: v_readlane_b32 s21, v22, 3 -; SI-NEXT: s_lshl_b32 s21, s21, 8 +; SI-NEXT: s_lshl_b32 s21, s64, 8 ; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: v_readlane_b32 s21, v22, 2 -; SI-NEXT: s_and_b32 s21, s21, 0xff -; SI-NEXT: v_readlane_b32 s22, v22, 1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_and_b32 s21, s99, 0xff ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_lshl_b32 s22, s22, 24 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_lshl_b32 s22, s96, 24 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: s_lshl_b32 s20, s68, 8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s20 +; SI-NEXT: v_readlane_b32 s20, v23, 16 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_lshl_b32 s20, s20, 8 +; SI-NEXT: v_readlane_b32 s21, v23, 17 ; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: s_and_b32 s20, s66, 0xff -; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_lshl_b32 s21, s64, 24 +; SI-NEXT: v_readlane_b32 s20, v23, 18 +; SI-NEXT: v_readlane_b32 s21, v23, 19 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: v_readlane_b32 s22, v23, 20 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_lshl_b32 s21, s22, 24 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: s_and_b32 s18, s19, 0xff -; SI-NEXT: v_readlane_b32 s19, v22, 0 -; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_lshl_b32 s19, s19, 8 -; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_lshl_b32 s19, s65, 8 ; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: v_readlane_b32 s19, v23, 63 -; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: v_readlane_b32 s20, v23, 62 -; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_and_b32 s19, s97, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s20, 24 -; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_lshl_b32 s20, s86, 24 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_readlane_b32 s18, v23, 0 +; SI-NEXT: v_readlane_b32 s18, v23, 22 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 -; SI-NEXT: v_readlane_b32 s19, v23, 1 +; SI-NEXT: v_readlane_b32 s19, v23, 23 ; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: v_readlane_b32 s18, v23, 2 -; SI-NEXT: v_readlane_b32 s19, v23, 3 +; SI-NEXT: v_readlane_b32 s18, v23, 24 +; SI-NEXT: v_readlane_b32 s19, v23, 25 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: v_readlane_b32 s20, v23, 4 +; SI-NEXT: v_readlane_b32 s20, v23, 26 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s20, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff @@ -8198,14 +8193,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v23, 61 -; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_lshl_b32 s17, s87, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v23, 60 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v23, 59 +; SI-NEXT: s_and_b32 s17, s84, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s18, s85, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 @@ -8213,15 +8205,15 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: v_readlane_b32 s16, v23, 6 +; SI-NEXT: v_readlane_b32 s16, v23, 28 ; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s17, v23, 7 +; SI-NEXT: v_readlane_b32 s17, v23, 29 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: v_readlane_b32 s16, v23, 8 -; SI-NEXT: v_readlane_b32 s17, v23, 9 +; SI-NEXT: v_readlane_b32 s16, v23, 30 +; SI-NEXT: v_readlane_b32 s17, v23, 31 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s18, v23, 10 +; SI-NEXT: v_readlane_b32 s18, v23, 32 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s18, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff @@ -8232,14 +8224,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: v_readlane_b32 s15, v23, 58 -; SI-NEXT: s_lshl_b32 s15, s15, 8 +; SI-NEXT: s_lshl_b32 s15, s82, 8 ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_readlane_b32 s15, v23, 57 -; SI-NEXT: s_and_b32 s15, s15, 0xff -; SI-NEXT: v_readlane_b32 s16, v23, 56 +; SI-NEXT: s_and_b32 s15, s83, 0xff ; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: s_lshl_b32 s16, s16, 24 +; SI-NEXT: s_lshl_b32 s16, s38, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s15, s16, s15 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 @@ -8247,15 +8236,15 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: v_readlane_b32 s14, v23, 12 +; SI-NEXT: v_readlane_b32 s14, v23, 34 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 8 -; SI-NEXT: v_readlane_b32 s15, v23, 13 +; SI-NEXT: v_readlane_b32 s15, v23, 35 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: v_readlane_b32 s14, v23, 14 -; SI-NEXT: v_readlane_b32 s15, v23, 15 +; SI-NEXT: v_readlane_b32 s14, v23, 36 +; SI-NEXT: v_readlane_b32 s15, v23, 37 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s16, v23, 16 +; SI-NEXT: v_readlane_b32 s16, v23, 38 ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_lshl_b32 s15, s16, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff @@ -8266,14 +8255,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: v_readlane_b32 s13, v23, 55 -; SI-NEXT: s_lshl_b32 s13, s13, 8 +; SI-NEXT: s_lshl_b32 s13, s39, 8 ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_readlane_b32 s13, v23, 54 -; SI-NEXT: s_and_b32 s13, s13, 0xff -; SI-NEXT: v_readlane_b32 s14, v23, 53 +; SI-NEXT: s_and_b32 s13, s37, 0xff ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: s_lshl_b32 s14, s35, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 @@ -8281,15 +8267,15 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: v_readlane_b32 s12, v23, 18 +; SI-NEXT: v_readlane_b32 s12, v23, 40 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 8 -; SI-NEXT: v_readlane_b32 s13, v23, 19 +; SI-NEXT: v_readlane_b32 s13, v23, 41 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: v_readlane_b32 s12, v23, 20 -; SI-NEXT: v_readlane_b32 s13, v23, 21 +; SI-NEXT: v_readlane_b32 s12, v23, 42 +; SI-NEXT: v_readlane_b32 s13, v23, 43 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s14, v23, 22 +; SI-NEXT: v_readlane_b32 s14, v23, 44 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_lshl_b32 s13, s14, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff @@ -8300,14 +8286,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v23, 52 -; SI-NEXT: s_lshl_b32 s11, s11, 8 +; SI-NEXT: s_lshl_b32 s11, s31, 8 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_readlane_b32 s11, v23, 51 -; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: v_readlane_b32 s12, v23, 50 +; SI-NEXT: s_and_b32 s11, s95, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: s_lshl_b32 s12, s93, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 @@ -8315,15 +8298,15 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_readlane_b32 s10, v23, 24 +; SI-NEXT: v_readlane_b32 s10, v23, 46 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: v_readlane_b32 s11, v23, 25 +; SI-NEXT: v_readlane_b32 s11, v23, 47 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: v_readlane_b32 s10, v23, 26 -; SI-NEXT: v_readlane_b32 s11, v23, 27 +; SI-NEXT: v_readlane_b32 s10, v23, 48 +; SI-NEXT: v_readlane_b32 s11, v23, 49 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s12, v23, 28 +; SI-NEXT: v_readlane_b32 s12, v23, 50 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s11, s12, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff @@ -8334,13 +8317,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v23, 49 -; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_lshl_b32 s9, s91, 8 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_readlane_b32 s9, v23, 48 -; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_and_b32 s9, s79, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s10, s50, 24 +; SI-NEXT: s_lshl_b32 s10, s77, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 @@ -8348,15 +8329,15 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_readlane_b32 s8, v23, 30 +; SI-NEXT: v_readlane_b32 s8, v23, 52 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v23, 31 +; SI-NEXT: v_readlane_b32 s9, v23, 53 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: v_readlane_b32 s8, v23, 32 -; SI-NEXT: v_readlane_b32 s9, v23, 33 +; SI-NEXT: v_readlane_b32 s8, v23, 54 +; SI-NEXT: v_readlane_b32 s9, v23, 55 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s10, v23, 34 +; SI-NEXT: v_readlane_b32 s10, v23, 56 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_lshl_b32 s9, s10, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff @@ -8367,14 +8348,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v23, 47 -; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_lshl_b32 s7, s73, 8 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_readlane_b32 s7, v23, 46 -; SI-NEXT: s_and_b32 s7, s7, 0xff -; SI-NEXT: v_readlane_b32 s8, v23, 45 +; SI-NEXT: s_and_b32 s7, s63, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_lshl_b32 s8, s59, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 @@ -8382,15 +8360,15 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_readlane_b32 s6, v23, 36 +; SI-NEXT: v_readlane_b32 s6, v23, 58 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: v_readlane_b32 s7, v23, 37 +; SI-NEXT: v_readlane_b32 s7, v23, 59 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: v_readlane_b32 s6, v23, 38 -; SI-NEXT: v_readlane_b32 s7, v23, 39 +; SI-NEXT: v_readlane_b32 s6, v23, 60 +; SI-NEXT: v_readlane_b32 s7, v23, 61 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s8, v23, 40 +; SI-NEXT: v_readlane_b32 s8, v23, 62 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s8, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -8401,28 +8379,30 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: v_readlane_b32 s5, v23, 44 -; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_lshl_b32 s5, s57, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v23, 43 -; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s6, v23, 42 +; SI-NEXT: s_and_b32 s5, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_lshl_b32 s6, s51, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s29, v23, 1 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s21, v23, 5 -; SI-NEXT: v_readlane_b32 s19, v23, 11 -; SI-NEXT: v_readlane_b32 s17, v23, 17 -; SI-NEXT: v_readlane_b32 s15, v23, 23 -; SI-NEXT: v_readlane_b32 s13, v23, 29 -; SI-NEXT: v_readlane_b32 s11, v23, 35 -; SI-NEXT: v_readlane_b32 s9, v23, 41 +; SI-NEXT: v_readlane_b32 s29, v23, 3 +; SI-NEXT: v_readlane_b32 s27, v23, 9 +; SI-NEXT: v_readlane_b32 s25, v23, 15 +; SI-NEXT: v_readlane_b32 s23, v23, 21 +; SI-NEXT: v_readlane_b32 s21, v23, 27 +; SI-NEXT: v_readlane_b32 s19, v23, 33 +; SI-NEXT: v_readlane_b32 s17, v23, 39 +; SI-NEXT: v_readlane_b32 s15, v23, 45 +; SI-NEXT: v_readlane_b32 s13, v23, 51 +; SI-NEXT: v_readlane_b32 s11, v23, 57 +; SI-NEXT: v_readlane_b32 s9, v23, 63 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v21, 35 ; SI-NEXT: v_readlane_b32 s98, v21, 34 @@ -8468,185 +8448,172 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v23, s50, 0 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 1 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 2 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 3 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 4 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 5 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 6 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 7 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 8 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 9 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 10 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 11 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 12 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 13 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 14 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 15 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 16 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 17 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 18 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 19 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 20 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 21 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 22 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 23 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 24 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 25 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 27 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 28 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 29 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 30 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 31 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 32 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 33 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 34 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 35 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 36 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 37 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 38 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s51, 39 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 40 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v23, s26, 0 +; SI-NEXT: v_writelane_b32 v23, s27, 1 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 2 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 3 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 4 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 5 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 6 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 7 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 9 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 10 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 11 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 12 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 13 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 14 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 15 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 16 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 17 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 18 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 19 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 20 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 21 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 22 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 23 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 24 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 25 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 26 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 28 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 29 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 30 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 31 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 32 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 33 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 34 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 35 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 36 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 37 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 38 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 39 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 40 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 41 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 42 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 43 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 44 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 45 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 46 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 47 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 48 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 49 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 50 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 51 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 52 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 53 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 54 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 55 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 56 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 57 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 58 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 59 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 60 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 61 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 62 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 63 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr81 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; kill: killed $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr99 ; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr97 ; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr87 ; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: v_writelane_b32 v23, s51, 41 -; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v32i32_to_v128i8_scalar: @@ -8737,578 +8704,750 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s5, v18 ; VI-NEXT: v_writelane_b32 v21, s87, 31 ; VI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; VI-NEXT: s_cbranch_scc0 .LBB13_4 +; VI-NEXT: s_cbranch_scc0 .LBB13_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s26, s5, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 8 -; VI-NEXT: s_lshr_b32 s26, s5, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 9 -; VI-NEXT: s_lshr_b32 s26, s5, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 10 -; VI-NEXT: s_lshr_b32 s26, s4, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 11 -; VI-NEXT: s_lshr_b32 s26, s4, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 12 -; VI-NEXT: s_lshr_b32 s26, s7, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 13 -; VI-NEXT: s_lshr_b32 s26, s7, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 14 -; VI-NEXT: s_lshr_b32 s26, s7, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 15 -; VI-NEXT: s_lshr_b32 s26, s6, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 16 -; VI-NEXT: s_lshr_b32 s26, s6, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 17 -; VI-NEXT: s_lshr_b32 s26, s9, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 18 -; VI-NEXT: s_lshr_b32 s26, s9, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 19 -; VI-NEXT: s_lshr_b32 s26, s9, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 20 -; VI-NEXT: s_lshr_b32 s26, s8, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 21 -; VI-NEXT: s_lshr_b32 s26, s8, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 22 -; VI-NEXT: s_lshr_b32 s26, s11, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 23 -; VI-NEXT: s_lshr_b32 s26, s11, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 24 -; VI-NEXT: s_lshr_b32 s26, s11, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 25 -; VI-NEXT: s_lshr_b32 s26, s10, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 26 -; VI-NEXT: s_lshr_b32 s26, s10, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 27 -; VI-NEXT: s_lshr_b32 s26, s13, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 28 -; VI-NEXT: s_lshr_b32 s26, s13, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 29 -; VI-NEXT: s_lshr_b32 s26, s13, 8 +; VI-NEXT: s_lshr_b32 s26, s5, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 30 -; VI-NEXT: s_lshr_b32 s26, s12, 16 +; VI-NEXT: s_lshr_b32 s26, s5, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 31 -; VI-NEXT: s_lshr_b32 s26, s12, 8 +; VI-NEXT: s_lshr_b32 s26, s4, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 32 -; VI-NEXT: s_lshr_b32 s26, s15, 24 +; VI-NEXT: s_lshr_b32 s26, s4, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 33 -; VI-NEXT: s_lshr_b32 s26, s15, 16 +; VI-NEXT: s_lshr_b32 s26, s10, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 34 -; VI-NEXT: s_lshr_b32 s26, s15, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 35 -; VI-NEXT: s_lshr_b32 s26, s14, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 36 -; VI-NEXT: s_lshr_b32 s26, s14, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 37 ; VI-NEXT: s_lshr_b32 s26, s17, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 38 -; VI-NEXT: s_lshr_b32 s26, s17, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 39 -; VI-NEXT: s_lshr_b32 s26, s17, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 40 -; VI-NEXT: s_lshr_b32 s26, s16, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 41 -; VI-NEXT: s_lshr_b32 s26, s16, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 42 -; VI-NEXT: s_lshr_b32 s26, s19, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 43 -; VI-NEXT: s_lshr_b32 s26, s19, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 44 -; VI-NEXT: s_lshr_b32 s26, s19, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 45 -; VI-NEXT: s_lshr_b32 s26, s18, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 46 -; VI-NEXT: s_lshr_b32 s26, s18, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 47 -; VI-NEXT: s_lshr_b32 s26, s21, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 48 -; VI-NEXT: s_lshr_b32 s26, s21, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 49 -; VI-NEXT: s_lshr_b32 s26, s21, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 50 -; VI-NEXT: s_lshr_b32 s26, s20, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 51 -; VI-NEXT: s_lshr_b32 s26, s20, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 52 -; VI-NEXT: s_lshr_b32 s26, s23, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 53 -; VI-NEXT: s_lshr_b32 s26, s23, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 54 -; VI-NEXT: s_lshr_b32 s26, s23, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 55 -; VI-NEXT: s_lshr_b32 s26, s22, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 56 -; VI-NEXT: s_lshr_b32 s26, s22, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 57 -; VI-NEXT: s_lshr_b32 s26, s25, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 58 -; VI-NEXT: s_lshr_b32 s26, s25, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 59 -; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 6 -; VI-NEXT: v_writelane_b32 v22, s61, 7 -; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 4 -; VI-NEXT: v_writelane_b32 v22, s61, 5 -; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 2 -; VI-NEXT: v_writelane_b32 v22, s61, 3 -; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 0 -; VI-NEXT: s_lshr_b32 s66, s25, 8 -; VI-NEXT: s_lshr_b32 s67, s24, 16 -; VI-NEXT: s_lshr_b32 s68, s24, 8 -; VI-NEXT: s_lshr_b32 s69, s41, 24 -; VI-NEXT: s_lshr_b32 s70, s41, 16 -; VI-NEXT: s_lshr_b32 s71, s41, 8 -; VI-NEXT: s_lshr_b32 s80, s40, 16 -; VI-NEXT: s_lshr_b32 s81, s40, 8 -; VI-NEXT: s_lshr_b32 s82, s43, 24 -; VI-NEXT: s_lshr_b32 s83, s43, 16 -; VI-NEXT: s_lshr_b32 s84, s43, 8 -; VI-NEXT: s_lshr_b32 s85, s42, 16 -; VI-NEXT: s_lshr_b32 s86, s42, 8 -; VI-NEXT: s_lshr_b32 s87, s45, 24 -; VI-NEXT: s_lshr_b32 s50, s45, 16 -; VI-NEXT: s_lshr_b32 s26, s45, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 35 +; VI-NEXT: s_lshr_b32 s27, s24, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 36 +; VI-NEXT: s_lshr_b32 s27, s41, 24 +; VI-NEXT: v_writelane_b32 v22, s27, 37 +; VI-NEXT: s_lshr_b32 s27, s41, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 38 +; VI-NEXT: s_lshr_b32 s27, s41, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 39 +; VI-NEXT: s_lshr_b32 s27, s40, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 40 +; VI-NEXT: s_lshr_b32 s27, s40, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 41 +; VI-NEXT: s_lshr_b32 s27, s43, 24 +; VI-NEXT: v_writelane_b32 v22, s27, 42 +; VI-NEXT: s_lshr_b32 s27, s43, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 43 +; VI-NEXT: s_lshr_b32 s27, s43, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 44 +; VI-NEXT: s_lshr_b32 s27, s42, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 45 +; VI-NEXT: s_lshr_b32 s27, s42, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 46 +; VI-NEXT: s_lshr_b32 s27, s45, 24 +; VI-NEXT: v_writelane_b32 v22, s27, 47 +; VI-NEXT: s_lshr_b32 s27, s45, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 48 +; VI-NEXT: s_lshr_b32 s27, s45, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 49 ; VI-NEXT: s_lshr_b32 s27, s44, 16 -; VI-NEXT: s_lshr_b32 s28, s44, 8 -; VI-NEXT: s_lshr_b32 s29, s47, 24 -; VI-NEXT: s_lshr_b32 s51, s47, 16 -; VI-NEXT: s_lshr_b32 s52, s47, 8 -; VI-NEXT: s_lshr_b32 s53, s46, 16 -; VI-NEXT: s_lshr_b32 s54, s46, 8 -; VI-NEXT: s_lshr_b32 s58, s57, 24 -; VI-NEXT: s_lshr_b32 s59, s57, 16 -; VI-NEXT: s_lshr_b32 s55, s57, 8 -; VI-NEXT: s_lshr_b32 s64, s56, 16 -; VI-NEXT: s_lshr_b32 s65, s56, 8 -; VI-NEXT: v_writelane_b32 v22, s61, 1 -; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; VI-NEXT: v_writelane_b32 v22, s27, 50 +; VI-NEXT: s_lshr_b32 s27, s44, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 51 +; VI-NEXT: s_lshr_b32 s27, s47, 24 +; VI-NEXT: v_writelane_b32 v22, s27, 52 +; VI-NEXT: s_lshr_b32 s27, s47, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 53 +; VI-NEXT: s_lshr_b32 s27, s47, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 54 +; VI-NEXT: s_lshr_b32 s27, s46, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 55 +; VI-NEXT: s_lshr_b32 s27, s46, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 56 +; VI-NEXT: s_lshr_b32 s79, s56, 16 +; VI-NEXT: v_writelane_b32 v22, s79, 28 +; VI-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 26 +; VI-NEXT: v_writelane_b32 v22, s91, 27 +; VI-NEXT: s_lshr_b64 s[90:91], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 24 +; VI-NEXT: v_writelane_b32 v22, s91, 25 +; VI-NEXT: s_lshr_b64 s[90:91], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 22 +; VI-NEXT: v_writelane_b32 v22, s91, 23 +; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 20 +; VI-NEXT: v_writelane_b32 v22, s91, 21 +; VI-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 18 +; VI-NEXT: v_writelane_b32 v22, s91, 19 +; VI-NEXT: s_lshr_b64 s[90:91], s[14:15], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 16 +; VI-NEXT: v_writelane_b32 v22, s91, 17 +; VI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 14 +; VI-NEXT: v_writelane_b32 v22, s91, 15 +; VI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 12 +; VI-NEXT: v_writelane_b32 v22, s91, 13 +; VI-NEXT: s_lshr_b64 s[90:91], s[20:21], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 10 +; VI-NEXT: v_writelane_b32 v22, s91, 11 +; VI-NEXT: s_lshr_b64 s[90:91], s[22:23], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 8 +; VI-NEXT: v_writelane_b32 v22, s91, 9 ; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 -; VI-NEXT: s_cbranch_execnz .LBB13_3 -; VI-NEXT: .LBB13_2: ; %cmp.true -; VI-NEXT: s_add_i32 s5, s5, 3 -; VI-NEXT: s_lshr_b32 s26, s5, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 8 -; VI-NEXT: s_lshr_b32 s26, s5, 16 -; VI-NEXT: s_add_i32 s4, s4, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 9 -; VI-NEXT: s_lshr_b32 s26, s5, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 10 -; VI-NEXT: s_lshr_b32 s26, s4, 16 -; VI-NEXT: s_add_i32 s7, s7, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 11 -; VI-NEXT: s_lshr_b32 s26, s4, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 12 -; VI-NEXT: s_lshr_b32 s26, s7, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 13 -; VI-NEXT: s_lshr_b32 s26, s7, 16 -; VI-NEXT: s_add_i32 s6, s6, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 14 -; VI-NEXT: s_lshr_b32 s26, s7, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 15 -; VI-NEXT: s_lshr_b32 s26, s6, 16 -; VI-NEXT: s_add_i32 s9, s9, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 16 -; VI-NEXT: s_lshr_b32 s26, s6, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 17 -; VI-NEXT: s_lshr_b32 s26, s9, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 18 -; VI-NEXT: s_lshr_b32 s26, s9, 16 -; VI-NEXT: s_add_i32 s8, s8, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 19 -; VI-NEXT: s_lshr_b32 s26, s9, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 20 -; VI-NEXT: s_lshr_b32 s26, s8, 16 -; VI-NEXT: s_add_i32 s11, s11, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 21 -; VI-NEXT: s_lshr_b32 s26, s8, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 22 -; VI-NEXT: s_lshr_b32 s26, s11, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 23 -; VI-NEXT: s_lshr_b32 s26, s11, 16 -; VI-NEXT: s_add_i32 s10, s10, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 24 -; VI-NEXT: s_lshr_b32 s26, s11, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 25 -; VI-NEXT: s_lshr_b32 s26, s10, 16 -; VI-NEXT: s_add_i32 s13, s13, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 26 -; VI-NEXT: s_lshr_b32 s26, s10, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 27 -; VI-NEXT: s_lshr_b32 s26, s13, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 28 -; VI-NEXT: s_lshr_b32 s26, s13, 16 -; VI-NEXT: s_add_i32 s12, s12, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 29 -; VI-NEXT: s_lshr_b32 s26, s13, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 30 -; VI-NEXT: s_lshr_b32 s26, s12, 16 -; VI-NEXT: s_add_i32 s15, s15, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 31 -; VI-NEXT: s_lshr_b32 s26, s12, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 32 -; VI-NEXT: s_lshr_b32 s26, s15, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 33 -; VI-NEXT: s_lshr_b32 s26, s15, 16 -; VI-NEXT: s_add_i32 s14, s14, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 34 -; VI-NEXT: s_lshr_b32 s26, s15, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 35 -; VI-NEXT: s_lshr_b32 s26, s14, 16 -; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 36 -; VI-NEXT: s_lshr_b32 s26, s14, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 37 -; VI-NEXT: s_lshr_b32 s26, s17, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 38 -; VI-NEXT: s_lshr_b32 s26, s17, 16 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 39 -; VI-NEXT: s_lshr_b32 s26, s17, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 40 -; VI-NEXT: s_lshr_b32 s26, s16, 16 -; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 41 -; VI-NEXT: s_lshr_b32 s26, s16, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 42 -; VI-NEXT: s_lshr_b32 s26, s19, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 43 -; VI-NEXT: s_lshr_b32 s26, s19, 16 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 44 -; VI-NEXT: s_lshr_b32 s26, s19, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 45 -; VI-NEXT: s_lshr_b32 s26, s18, 16 -; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 46 -; VI-NEXT: s_lshr_b32 s26, s18, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 47 -; VI-NEXT: s_lshr_b32 s26, s21, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 48 -; VI-NEXT: s_lshr_b32 s26, s21, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 49 -; VI-NEXT: s_lshr_b32 s26, s21, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 50 -; VI-NEXT: s_lshr_b32 s26, s20, 16 -; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 51 -; VI-NEXT: s_lshr_b32 s26, s20, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 52 -; VI-NEXT: s_lshr_b32 s26, s23, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 53 -; VI-NEXT: s_lshr_b32 s26, s23, 16 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 54 -; VI-NEXT: s_lshr_b32 s26, s23, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 55 -; VI-NEXT: s_lshr_b32 s26, s22, 16 +; VI-NEXT: v_writelane_b32 v22, s90, 6 +; VI-NEXT: v_writelane_b32 v22, s91, 7 +; VI-NEXT: s_lshr_b64 s[90:91], s[40:41], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 4 +; VI-NEXT: v_writelane_b32 v22, s91, 5 +; VI-NEXT: s_lshr_b64 s[90:91], s[42:43], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 2 +; VI-NEXT: v_writelane_b32 v22, s91, 3 +; VI-NEXT: s_lshr_b64 s[90:91], s[44:45], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 0 +; VI-NEXT: s_lshr_b32 s48, s7, 24 +; VI-NEXT: s_lshr_b32 s82, s7, 8 +; VI-NEXT: s_lshr_b32 s84, s9, 24 +; VI-NEXT: s_lshr_b32 s88, s9, 8 +; VI-NEXT: v_writelane_b32 v22, s91, 1 +; VI-NEXT: s_lshr_b64 s[90:91], s[46:47], 24 +; VI-NEXT: s_lshr_b32 s49, s7, 16 +; VI-NEXT: s_lshr_b32 s83, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s6, 8 +; VI-NEXT: s_lshr_b32 s85, s9, 16 +; VI-NEXT: s_lshr_b32 s36, s8, 16 +; VI-NEXT: s_lshr_b32 s28, s8, 8 +; VI-NEXT: s_lshr_b32 s86, s11, 24 +; VI-NEXT: s_lshr_b32 s87, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s11, 8 +; VI-NEXT: s_lshr_b32 s59, s10, 16 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s50, s13, 8 +; VI-NEXT: s_lshr_b32 s51, s12, 16 +; VI-NEXT: s_lshr_b32 s52, s12, 8 +; VI-NEXT: s_lshr_b32 s34, s15, 24 +; VI-NEXT: s_lshr_b32 s35, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s15, 8 +; VI-NEXT: s_lshr_b32 s61, s14, 16 +; VI-NEXT: s_lshr_b32 s53, s14, 8 +; VI-NEXT: s_lshr_b32 s37, s17, 16 +; VI-NEXT: s_lshr_b32 s54, s17, 8 +; VI-NEXT: s_lshr_b32 s55, s16, 16 +; VI-NEXT: s_lshr_b32 s62, s16, 8 +; VI-NEXT: s_lshr_b32 s38, s19, 24 +; VI-NEXT: s_lshr_b32 s39, s19, 16 +; VI-NEXT: s_lshr_b32 s63, s19, 8 +; VI-NEXT: s_lshr_b32 s64, s18, 16 +; VI-NEXT: s_lshr_b32 s65, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s21, 24 +; VI-NEXT: s_lshr_b32 s73, s21, 16 +; VI-NEXT: s_lshr_b32 s66, s21, 8 +; VI-NEXT: s_lshr_b32 s67, s20, 16 +; VI-NEXT: s_lshr_b32 s68, s20, 8 +; VI-NEXT: s_lshr_b32 s69, s23, 24 +; VI-NEXT: s_lshr_b32 s74, s23, 16 +; VI-NEXT: s_lshr_b32 s75, s23, 8 +; VI-NEXT: s_lshr_b32 s70, s22, 16 +; VI-NEXT: s_lshr_b32 s71, s22, 8 +; VI-NEXT: s_lshr_b32 s80, s25, 24 +; VI-NEXT: s_lshr_b32 s76, s25, 16 +; VI-NEXT: s_lshr_b32 s77, s25, 8 +; VI-NEXT: s_lshr_b32 s26, s24, 16 +; VI-NEXT: s_lshr_b32 s27, s57, 24 +; VI-NEXT: s_lshr_b32 s81, s57, 16 +; VI-NEXT: s_lshr_b32 s78, s57, 8 +; VI-NEXT: s_lshr_b32 s79, s56, 8 +; VI-NEXT: s_mov_b32 s91, s48 +; VI-NEXT: s_mov_b32 s48, s82 +; VI-NEXT: s_mov_b32 s82, s84 +; VI-NEXT: s_mov_b32 s84, s88 +; VI-NEXT: s_lshr_b64 s[88:89], s[56:57], 24 +; VI-NEXT: s_mov_b64 vcc, 0 +; VI-NEXT: s_branch .LBB13_3 +; VI-NEXT: .LBB13_2: +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, s26, 0 +; VI-NEXT: v_writelane_b32 v22, s27, 1 +; VI-NEXT: v_writelane_b32 v22, s28, 2 +; VI-NEXT: v_writelane_b32 v22, s29, 3 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 4 +; VI-NEXT: v_writelane_b32 v22, s29, 5 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 6 +; VI-NEXT: v_writelane_b32 v22, s29, 7 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 8 +; VI-NEXT: v_writelane_b32 v22, s29, 9 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 10 +; VI-NEXT: v_writelane_b32 v22, s29, 11 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 12 +; VI-NEXT: v_writelane_b32 v22, s29, 13 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 14 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, s29, 15 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 16 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, s29, 17 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 18 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, s29, 19 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 20 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, s29, 21 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 22 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 23 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 24 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 25 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 27 +; VI-NEXT: s_mov_b64 vcc, -1 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: .LBB13_3: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, vcc +; VI-NEXT: s_mov_b32 s89, s36 +; VI-NEXT: s_mov_b32 s36, s26 +; VI-NEXT: s_mov_b32 s26, s27 +; VI-NEXT: s_mov_b32 s27, s78 +; VI-NEXT: v_readlane_b32 s78, v22, 28 +; VI-NEXT: s_cbranch_vccnz .LBB13_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: s_add_i32 s5, s5, 3 +; VI-NEXT: s_add_i32 s4, s4, 3 +; VI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; VI-NEXT: s_add_i32 s7, s7, 3 +; VI-NEXT: s_add_i32 s6, s6, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 26 +; VI-NEXT: v_writelane_b32 v22, s27, 27 +; VI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; VI-NEXT: s_add_i32 s9, s9, 3 +; VI-NEXT: s_add_i32 s8, s8, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 24 +; VI-NEXT: v_writelane_b32 v22, s27, 25 +; VI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; VI-NEXT: s_add_i32 s11, s11, 3 +; VI-NEXT: s_add_i32 s10, s10, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 22 +; VI-NEXT: v_writelane_b32 v22, s27, 23 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: s_add_i32 s13, s13, 3 +; VI-NEXT: s_add_i32 s12, s12, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 20 +; VI-NEXT: v_writelane_b32 v22, s27, 21 +; VI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; VI-NEXT: s_add_i32 s15, s15, 3 +; VI-NEXT: s_add_i32 s14, s14, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 18 +; VI-NEXT: v_writelane_b32 v22, s27, 19 +; VI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; VI-NEXT: s_add_i32 s17, s17, 3 +; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 17 +; VI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; VI-NEXT: s_add_i32 s19, s19, 3 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 14 +; VI-NEXT: v_writelane_b32 v22, s27, 15 +; VI-NEXT: s_lshr_b64 s[26:27], s[18:19], 24 +; VI-NEXT: s_add_i32 s21, s21, 3 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 12 +; VI-NEXT: v_writelane_b32 v22, s27, 13 +; VI-NEXT: s_lshr_b64 s[26:27], s[20:21], 24 +; VI-NEXT: s_add_i32 s23, s23, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 10 +; VI-NEXT: v_writelane_b32 v22, s27, 11 +; VI-NEXT: s_lshr_b64 s[26:27], s[22:23], 24 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_writelane_b32 v22, s26, 56 -; VI-NEXT: s_lshr_b32 s26, s22, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 57 -; VI-NEXT: s_lshr_b32 s26, s25, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 58 -; VI-NEXT: s_lshr_b32 s26, s25, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 59 -; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 6 -; VI-NEXT: v_writelane_b32 v22, s61, 7 -; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 4 -; VI-NEXT: v_writelane_b32 v22, s61, 5 -; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 2 -; VI-NEXT: s_add_i32 s57, s57, 3 -; VI-NEXT: s_add_i32 s56, s56, 3 -; VI-NEXT: s_add_i32 s47, s47, 3 -; VI-NEXT: s_add_i32 s46, s46, 3 -; VI-NEXT: s_add_i32 s45, s45, 3 -; VI-NEXT: s_add_i32 s44, s44, 3 -; VI-NEXT: s_add_i32 s43, s43, 3 -; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 9 +; VI-NEXT: s_lshr_b64 s[26:27], s[24:25], 24 ; VI-NEXT: s_add_i32 s41, s41, 3 ; VI-NEXT: s_add_i32 s40, s40, 3 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_writelane_b32 v22, s61, 3 -; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: s_lshr_b32 s66, s25, 8 -; VI-NEXT: s_lshr_b32 s67, s24, 16 -; VI-NEXT: s_lshr_b32 s68, s24, 8 -; VI-NEXT: s_lshr_b32 s69, s41, 24 -; VI-NEXT: s_lshr_b32 s70, s41, 16 -; VI-NEXT: s_lshr_b32 s71, s41, 8 -; VI-NEXT: s_lshr_b32 s80, s40, 16 -; VI-NEXT: s_lshr_b32 s81, s40, 8 -; VI-NEXT: s_lshr_b32 s82, s43, 24 -; VI-NEXT: s_lshr_b32 s83, s43, 16 -; VI-NEXT: s_lshr_b32 s84, s43, 8 -; VI-NEXT: s_lshr_b32 s85, s42, 16 -; VI-NEXT: s_lshr_b32 s86, s42, 8 -; VI-NEXT: s_lshr_b32 s87, s45, 24 -; VI-NEXT: s_lshr_b32 s50, s45, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 6 +; VI-NEXT: v_writelane_b32 v22, s27, 7 +; VI-NEXT: s_lshr_b64 s[26:27], s[40:41], 24 +; VI-NEXT: s_add_i32 s43, s43, 3 +; VI-NEXT: s_add_i32 s42, s42, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 4 +; VI-NEXT: v_writelane_b32 v22, s27, 5 +; VI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 +; VI-NEXT: s_add_i32 s45, s45, 3 +; VI-NEXT: s_add_i32 s44, s44, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 2 +; VI-NEXT: v_writelane_b32 v22, s27, 3 +; VI-NEXT: s_lshr_b64 s[26:27], s[44:45], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 0 +; VI-NEXT: v_writelane_b32 v22, s27, 1 +; VI-NEXT: s_lshr_b32 s26, s5, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 29 +; VI-NEXT: s_lshr_b32 s26, s5, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 30 +; VI-NEXT: s_lshr_b32 s26, s5, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 31 +; VI-NEXT: s_lshr_b32 s26, s4, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 32 +; VI-NEXT: s_lshr_b32 s26, s4, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 33 +; VI-NEXT: s_lshr_b32 s26, s10, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 34 +; VI-NEXT: s_lshr_b32 s26, s17, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 35 +; VI-NEXT: s_lshr_b32 s26, s24, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 36 +; VI-NEXT: s_lshr_b32 s26, s41, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 37 +; VI-NEXT: s_lshr_b32 s26, s41, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 38 +; VI-NEXT: s_lshr_b32 s26, s41, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 39 +; VI-NEXT: s_lshr_b32 s26, s40, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 40 +; VI-NEXT: s_lshr_b32 s26, s40, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 41 +; VI-NEXT: s_lshr_b32 s26, s43, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 42 +; VI-NEXT: s_lshr_b32 s26, s43, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 43 +; VI-NEXT: s_lshr_b32 s26, s43, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 44 +; VI-NEXT: s_lshr_b32 s26, s42, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 45 +; VI-NEXT: s_lshr_b32 s26, s42, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 46 +; VI-NEXT: s_lshr_b32 s26, s45, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 47 +; VI-NEXT: s_lshr_b32 s26, s45, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 48 ; VI-NEXT: s_lshr_b32 s26, s45, 8 -; VI-NEXT: s_lshr_b32 s27, s44, 16 -; VI-NEXT: s_lshr_b32 s28, s44, 8 -; VI-NEXT: s_lshr_b32 s29, s47, 24 -; VI-NEXT: s_lshr_b32 s51, s47, 16 -; VI-NEXT: s_lshr_b32 s52, s47, 8 -; VI-NEXT: s_lshr_b32 s53, s46, 16 -; VI-NEXT: s_lshr_b32 s54, s46, 8 -; VI-NEXT: s_lshr_b32 s58, s57, 24 -; VI-NEXT: s_lshr_b32 s59, s57, 16 -; VI-NEXT: s_lshr_b32 s55, s57, 8 -; VI-NEXT: s_lshr_b32 s64, s56, 16 -; VI-NEXT: s_lshr_b32 s65, s56, 8 -; VI-NEXT: v_writelane_b32 v22, s60, 0 -; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 -; VI-NEXT: v_writelane_b32 v22, s61, 1 -; VI-NEXT: .LBB13_3: ; %end -; VI-NEXT: s_lshl_b32 s61, s65, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 49 +; VI-NEXT: s_lshr_b32 s26, s44, 16 +; VI-NEXT: s_add_i32 s47, s47, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 50 +; VI-NEXT: s_lshr_b32 s26, s44, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 51 +; VI-NEXT: s_lshr_b32 s26, s47, 24 +; VI-NEXT: v_writelane_b32 v22, s26, 52 +; VI-NEXT: s_lshr_b32 s26, s47, 16 +; VI-NEXT: s_add_i32 s46, s46, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 53 +; VI-NEXT: s_lshr_b32 s26, s47, 8 +; VI-NEXT: s_add_i32 s57, s57, 3 +; VI-NEXT: s_add_i32 s56, s56, 3 +; VI-NEXT: v_writelane_b32 v22, s26, 54 +; VI-NEXT: s_lshr_b32 s26, s46, 16 +; VI-NEXT: s_lshr_b64 s[90:91], s[46:47], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[56:57], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 55 +; VI-NEXT: s_lshr_b32 s26, s46, 8 +; VI-NEXT: s_lshr_b32 s91, s7, 24 +; VI-NEXT: s_lshr_b32 s49, s7, 16 +; VI-NEXT: s_lshr_b32 s48, s7, 8 +; VI-NEXT: s_lshr_b32 s83, s6, 16 +; VI-NEXT: s_lshr_b32 s29, s6, 8 +; VI-NEXT: s_lshr_b32 s82, s9, 24 +; VI-NEXT: s_lshr_b32 s85, s9, 16 +; VI-NEXT: s_lshr_b32 s84, s9, 8 +; VI-NEXT: s_lshr_b32 s89, s8, 16 +; VI-NEXT: s_lshr_b32 s28, s8, 8 +; VI-NEXT: s_lshr_b32 s86, s11, 24 +; VI-NEXT: s_lshr_b32 s87, s11, 16 +; VI-NEXT: s_lshr_b32 s58, s11, 8 +; VI-NEXT: s_lshr_b32 s59, s10, 16 +; VI-NEXT: s_lshr_b32 s30, s13, 24 +; VI-NEXT: s_lshr_b32 s31, s13, 16 +; VI-NEXT: s_lshr_b32 s50, s13, 8 +; VI-NEXT: s_lshr_b32 s51, s12, 16 +; VI-NEXT: s_lshr_b32 s52, s12, 8 +; VI-NEXT: s_lshr_b32 s34, s15, 24 +; VI-NEXT: s_lshr_b32 s35, s15, 16 +; VI-NEXT: s_lshr_b32 s60, s15, 8 +; VI-NEXT: s_lshr_b32 s61, s14, 16 +; VI-NEXT: s_lshr_b32 s53, s14, 8 +; VI-NEXT: s_lshr_b32 s37, s17, 16 +; VI-NEXT: s_lshr_b32 s54, s17, 8 +; VI-NEXT: s_lshr_b32 s55, s16, 16 +; VI-NEXT: s_lshr_b32 s62, s16, 8 +; VI-NEXT: s_lshr_b32 s38, s19, 24 +; VI-NEXT: s_lshr_b32 s39, s19, 16 +; VI-NEXT: s_lshr_b32 s63, s19, 8 +; VI-NEXT: s_lshr_b32 s64, s18, 16 +; VI-NEXT: s_lshr_b32 s65, s18, 8 +; VI-NEXT: s_lshr_b32 s72, s21, 24 +; VI-NEXT: s_lshr_b32 s73, s21, 16 +; VI-NEXT: s_lshr_b32 s66, s21, 8 +; VI-NEXT: s_lshr_b32 s67, s20, 16 +; VI-NEXT: s_lshr_b32 s68, s20, 8 +; VI-NEXT: s_lshr_b32 s69, s23, 24 +; VI-NEXT: s_lshr_b32 s74, s23, 16 +; VI-NEXT: s_lshr_b32 s75, s23, 8 +; VI-NEXT: s_lshr_b32 s70, s22, 16 +; VI-NEXT: s_lshr_b32 s71, s22, 8 +; VI-NEXT: s_lshr_b32 s80, s25, 24 +; VI-NEXT: s_lshr_b32 s76, s25, 16 +; VI-NEXT: s_lshr_b32 s77, s25, 8 +; VI-NEXT: s_lshr_b32 s36, s24, 16 +; VI-NEXT: v_writelane_b32 v22, s26, 56 +; VI-NEXT: s_lshr_b32 s26, s57, 24 +; VI-NEXT: s_lshr_b32 s81, s57, 16 +; VI-NEXT: s_lshr_b32 s27, s57, 8 +; VI-NEXT: s_lshr_b32 s78, s56, 16 +; VI-NEXT: s_lshr_b32 s79, s56, 8 +; VI-NEXT: .LBB13_5: ; %end ; VI-NEXT: s_and_b32 s56, s56, 0xff -; VI-NEXT: s_or_b32 s56, s56, s61 -; VI-NEXT: s_lshl_b32 s61, s48, 8 -; VI-NEXT: s_and_b32 s63, s64, 0xff -; VI-NEXT: s_or_b32 s61, s63, s61 +; VI-NEXT: s_lshl_b32 s79, s79, 8 +; VI-NEXT: s_or_b32 s56, s56, s79 +; VI-NEXT: s_and_b32 s78, s78, 0xff +; VI-NEXT: s_lshl_b32 s79, s88, 8 +; VI-NEXT: s_or_b32 s78, s78, s79 ; VI-NEXT: s_and_b32 s56, s56, 0xffff -; VI-NEXT: s_lshl_b32 s61, s61, 16 -; VI-NEXT: s_or_b32 s56, s56, s61 +; VI-NEXT: s_lshl_b32 s78, s78, 16 +; VI-NEXT: s_or_b32 s56, s56, s78 ; VI-NEXT: v_mov_b32_e32 v1, s56 ; VI-NEXT: s_and_b32 s56, s57, 0xff -; VI-NEXT: s_lshl_b32 s57, s55, 8 -; VI-NEXT: s_or_b32 s56, s56, s57 -; VI-NEXT: s_and_b32 s57, s59, 0xff -; VI-NEXT: s_lshl_b32 s58, s58, 8 -; VI-NEXT: s_or_b32 s57, s57, s58 -; VI-NEXT: s_and_b32 s56, s56, 0xffff -; VI-NEXT: s_lshl_b32 s57, s57, 16 +; VI-NEXT: s_lshl_b32 s57, s27, 8 +; VI-NEXT: s_and_b32 s27, s81, 0xff +; VI-NEXT: s_lshl_b32 s26, s26, 8 ; VI-NEXT: s_or_b32 s56, s56, s57 -; VI-NEXT: v_mov_b32_e32 v2, s56 -; VI-NEXT: s_lshl_b32 s56, s54, 8 -; VI-NEXT: s_and_b32 s46, s46, 0xff -; VI-NEXT: s_or_b32 s46, s46, s56 -; VI-NEXT: s_lshl_b32 s56, s38, 8 -; VI-NEXT: s_and_b32 s57, s53, 0xff -; VI-NEXT: s_or_b32 s56, s57, s56 -; VI-NEXT: s_and_b32 s46, s46, 0xffff -; VI-NEXT: s_lshl_b32 s56, s56, 16 -; VI-NEXT: s_or_b32 s46, s46, s56 -; VI-NEXT: v_mov_b32_e32 v3, s46 -; VI-NEXT: s_and_b32 s46, s47, 0xff -; VI-NEXT: s_lshl_b32 s47, s52, 8 -; VI-NEXT: s_or_b32 s46, s46, s47 -; VI-NEXT: s_and_b32 s47, s51, 0xff -; VI-NEXT: s_lshl_b32 s29, s29, 8 -; VI-NEXT: s_or_b32 s29, s47, s29 -; VI-NEXT: s_and_b32 s46, s46, 0xffff -; VI-NEXT: s_lshl_b32 s29, s29, 16 -; VI-NEXT: s_or_b32 s29, s46, s29 -; VI-NEXT: v_mov_b32_e32 v4, s29 -; VI-NEXT: s_lshl_b32 s28, s28, 8 -; VI-NEXT: s_and_b32 s29, s44, 0xff -; VI-NEXT: s_or_b32 s28, s29, s28 -; VI-NEXT: s_lshl_b32 s29, s36, 8 +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_and_b32 s27, s56, 0xffff +; VI-NEXT: s_lshl_b32 s26, s26, 16 +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: v_readlane_b32 s27, v22, 56 +; VI-NEXT: v_mov_b32_e32 v2, s26 +; VI-NEXT: s_and_b32 s26, s46, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 55 ; VI-NEXT: s_and_b32 s27, s27, 0xff -; VI-NEXT: s_or_b32 s27, s27, s29 -; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_lshl_b32 s46, s90, 8 +; VI-NEXT: s_or_b32 s27, s27, s46 +; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 -; VI-NEXT: s_or_b32 s27, s28, s27 -; VI-NEXT: v_mov_b32_e32 v5, s27 -; VI-NEXT: s_and_b32 s27, s45, 0xff -; VI-NEXT: s_lshl_b32 s26, s26, 8 -; VI-NEXT: s_or_b32 s26, s27, s26 -; VI-NEXT: s_and_b32 s27, s50, 0xff -; VI-NEXT: s_lshl_b32 s28, s87, 8 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 54 +; VI-NEXT: v_mov_b32_e32 v3, s26 +; VI-NEXT: s_and_b32 s26, s47, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 53 +; VI-NEXT: v_readlane_b32 s46, v22, 52 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s46, s46, 8 +; VI-NEXT: s_or_b32 s27, s27, s46 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 51 +; VI-NEXT: v_mov_b32_e32 v4, s26 +; VI-NEXT: s_and_b32 s26, s44, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 50 +; VI-NEXT: v_readlane_b32 s46, v22, 0 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s44, s46, 8 +; VI-NEXT: s_or_b32 s27, s27, s44 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 49 +; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: s_and_b32 s26, s45, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 48 +; VI-NEXT: v_readlane_b32 s44, v22, 47 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s44, s44, 8 +; VI-NEXT: s_or_b32 s27, s27, s44 ; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 ; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 46 ; VI-NEXT: v_mov_b32_e32 v6, s26 -; VI-NEXT: s_lshl_b32 s26, s86, 8 -; VI-NEXT: s_and_b32 s27, s42, 0xff -; VI-NEXT: s_or_b32 s26, s27, s26 -; VI-NEXT: s_lshl_b32 s27, s34, 8 -; VI-NEXT: s_and_b32 s28, s85, 0xff -; VI-NEXT: s_or_b32 s27, s28, s27 +; VI-NEXT: s_and_b32 s26, s42, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 45 +; VI-NEXT: v_readlane_b32 s44, v22, 2 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s42, s44, 8 +; VI-NEXT: s_or_b32 s27, s27, s42 ; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 ; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 44 ; VI-NEXT: v_mov_b32_e32 v7, s26 ; VI-NEXT: s_and_b32 s26, s43, 0xff -; VI-NEXT: s_lshl_b32 s27, s84, 8 +; VI-NEXT: s_lshl_b32 s27, s27, 8 ; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, s83, 0xff -; VI-NEXT: s_lshl_b32 s28, s82, 8 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: v_readlane_b32 s27, v22, 43 +; VI-NEXT: v_readlane_b32 s42, v22, 42 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s42, s42, 8 +; VI-NEXT: s_or_b32 s27, s27, s42 ; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 ; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 41 ; VI-NEXT: v_mov_b32_e32 v8, s26 -; VI-NEXT: s_lshl_b32 s26, s81, 8 -; VI-NEXT: s_and_b32 s27, s40, 0xff -; VI-NEXT: s_or_b32 s26, s27, s26 -; VI-NEXT: s_lshl_b32 s27, s30, 8 -; VI-NEXT: s_and_b32 s28, s80, 0xff -; VI-NEXT: s_or_b32 s27, s28, s27 +; VI-NEXT: s_and_b32 s26, s40, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 40 +; VI-NEXT: v_readlane_b32 s42, v22, 4 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s40, s42, 8 +; VI-NEXT: s_or_b32 s27, s27, s40 ; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 ; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 39 ; VI-NEXT: v_mov_b32_e32 v9, s26 ; VI-NEXT: s_and_b32 s26, s41, 0xff -; VI-NEXT: s_lshl_b32 s27, s71, 8 +; VI-NEXT: s_lshl_b32 s27, s27, 8 ; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, s70, 0xff -; VI-NEXT: s_lshl_b32 s28, s69, 8 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: v_readlane_b32 s27, v22, 38 +; VI-NEXT: v_readlane_b32 s40, v22, 37 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s40, s40, 8 +; VI-NEXT: s_or_b32 s27, s27, s40 ; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 ; VI-NEXT: s_or_b32 s26, s26, s27 ; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: s_lshl_b32 s26, s68, 8 +; VI-NEXT: v_readlane_b32 s26, v22, 36 ; VI-NEXT: s_and_b32 s24, s24, 0xff +; VI-NEXT: s_lshl_b32 s26, s26, 8 +; VI-NEXT: v_readlane_b32 s40, v22, 6 ; VI-NEXT: s_or_b32 s24, s24, s26 -; VI-NEXT: s_lshl_b32 s26, s90, 8 -; VI-NEXT: s_and_b32 s27, s67, 0xff -; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_and_b32 s26, s36, 0xff +; VI-NEXT: s_lshl_b32 s27, s40, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 ; VI-NEXT: s_and_b32 s24, s24, 0xffff ; VI-NEXT: s_lshl_b32 s26, s26, 16 ; VI-NEXT: s_or_b32 s24, s24, s26 ; VI-NEXT: v_mov_b32_e32 v11, s24 ; VI-NEXT: s_and_b32 s24, s25, 0xff -; VI-NEXT: s_lshl_b32 s25, s66, 8 +; VI-NEXT: s_lshl_b32 s25, s77, 8 ; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: v_readlane_b32 s25, v22, 59 -; VI-NEXT: v_readlane_b32 s26, v22, 58 -; VI-NEXT: s_and_b32 s25, s25, 0xff -; VI-NEXT: s_lshl_b32 s26, s26, 8 +; VI-NEXT: s_and_b32 s25, s76, 0xff +; VI-NEXT: s_lshl_b32 s26, s80, 8 ; VI-NEXT: s_or_b32 s25, s25, s26 ; VI-NEXT: s_and_b32 s24, s24, 0xffff ; VI-NEXT: s_lshl_b32 s25, s25, 16 ; VI-NEXT: s_or_b32 s24, s24, s25 ; VI-NEXT: v_mov_b32_e32 v12, s24 -; VI-NEXT: v_readlane_b32 s24, v22, 57 -; VI-NEXT: s_lshl_b32 s24, s24, 8 ; VI-NEXT: s_and_b32 s22, s22, 0xff -; VI-NEXT: v_readlane_b32 s25, v22, 56 -; VI-NEXT: s_or_b32 s22, s22, s24 -; VI-NEXT: s_lshl_b32 s24, s88, 8 -; VI-NEXT: s_and_b32 s25, s25, 0xff -; VI-NEXT: s_or_b32 s24, s25, s24 -; VI-NEXT: s_and_b32 s22, s22, 0xffff -; VI-NEXT: s_lshl_b32 s24, s24, 16 +; VI-NEXT: s_lshl_b32 s24, s71, 8 +; VI-NEXT: v_readlane_b32 s26, v22, 8 ; VI-NEXT: s_or_b32 s22, s22, s24 -; VI-NEXT: v_mov_b32_e32 v13, s22 -; VI-NEXT: s_and_b32 s22, s23, 0xff -; VI-NEXT: v_readlane_b32 s23, v22, 55 -; VI-NEXT: s_lshl_b32 s23, s23, 8 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: v_readlane_b32 s23, v22, 54 -; VI-NEXT: v_readlane_b32 s24, v22, 53 -; VI-NEXT: s_and_b32 s23, s23, 0xff -; VI-NEXT: s_lshl_b32 s24, s24, 8 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s22, s22, 0xffff -; VI-NEXT: s_lshl_b32 s23, s23, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: v_mov_b32_e32 v14, s22 -; VI-NEXT: v_readlane_b32 s22, v22, 52 -; VI-NEXT: s_lshl_b32 s22, s22, 8 -; VI-NEXT: s_and_b32 s20, s20, 0xff -; VI-NEXT: v_readlane_b32 s23, v22, 51 -; VI-NEXT: s_or_b32 s20, s20, s22 -; VI-NEXT: s_lshl_b32 s22, s78, 8 -; VI-NEXT: s_and_b32 s23, s23, 0xff -; VI-NEXT: s_or_b32 s22, s23, s22 -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_lshl_b32 s22, s22, 16 -; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: s_and_b32 s24, s70, 0xff +; VI-NEXT: s_lshl_b32 s25, s26, 8 +; VI-NEXT: s_or_b32 s24, s24, s25 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: s_and_b32 s20, s21, 0xff -; VI-NEXT: v_readlane_b32 s21, v22, 50 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_lshl_b32 s24, s24, 16 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_lshl_b32 s21, s21, 8 +; VI-NEXT: s_or_b32 s22, s22, s24 ; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: s_and_b32 s22, s23, 0xff +; VI-NEXT: s_lshl_b32 s23, s75, 8 ; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: v_readlane_b32 s21, v22, 49 -; VI-NEXT: v_readlane_b32 s22, v22, 48 +; VI-NEXT: s_or_b32 s22, s22, s23 ; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: s_and_b32 s21, s21, 0xff -; VI-NEXT: s_lshl_b32 s22, s22, 8 +; VI-NEXT: s_and_b32 s23, s74, 0xff +; VI-NEXT: s_lshl_b32 s24, s69, 8 ; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_or_b32 s23, s23, s24 ; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_lshl_b32 s21, s21, 16 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_lshl_b32 s23, s23, 16 ; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s22, s22, s23 ; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: v_mov_b32_e32 v2, s20 -; VI-NEXT: v_readlane_b32 s20, v22, 47 +; VI-NEXT: v_mov_b32_e32 v2, s22 +; VI-NEXT: s_and_b32 s20, s20, 0xff +; VI-NEXT: s_lshl_b32 s22, s68, 8 +; VI-NEXT: v_readlane_b32 s24, v22, 10 ; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: s_lshl_b32 s20, s20, 8 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: s_and_b32 s22, s67, 0xff +; VI-NEXT: s_lshl_b32 s23, s24, 8 ; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; VI-NEXT: s_or_b32 s18, s18, s20 -; VI-NEXT: v_readlane_b32 s20, v22, 46 +; VI-NEXT: s_or_b32 s22, s22, s23 ; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; VI-NEXT: s_and_b32 s20, s20, 0xff -; VI-NEXT: s_lshl_b32 s21, s76, 8 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_lshl_b32 s22, s22, 16 ; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s20 +; VI-NEXT: s_and_b32 s20, s21, 0xff +; VI-NEXT: s_lshl_b32 s21, s66, 8 ; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; VI-NEXT: s_and_b32 s21, s73, 0xff +; VI-NEXT: s_lshl_b32 s22, s72, 8 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_lshl_b32 s21, s21, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s20 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s20, s65, 8 +; VI-NEXT: v_readlane_b32 s22, v22, 12 +; VI-NEXT: s_or_b32 s18, s18, s20 +; VI-NEXT: s_and_b32 s20, s64, 0xff +; VI-NEXT: s_lshl_b32 s21, s22, 8 +; VI-NEXT: s_or_b32 s20, s20, s21 ; VI-NEXT: s_and_b32 s18, s18, 0xffff ; VI-NEXT: s_lshl_b32 s20, s20, 16 -; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 ; VI-NEXT: s_or_b32 s18, s18, s20 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: s_and_b32 s18, s19, 0xff -; VI-NEXT: v_readlane_b32 s19, v22, 45 -; VI-NEXT: s_lshl_b32 s19, s19, 8 +; VI-NEXT: s_lshl_b32 s19, s63, 8 ; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: v_readlane_b32 s19, v22, 44 -; VI-NEXT: v_readlane_b32 s20, v22, 43 -; VI-NEXT: s_and_b32 s19, s19, 0xff -; VI-NEXT: s_lshl_b32 s20, s20, 8 +; VI-NEXT: s_and_b32 s19, s39, 0xff +; VI-NEXT: s_lshl_b32 s20, s38, 8 ; VI-NEXT: s_or_b32 s19, s19, s20 ; VI-NEXT: s_and_b32 s18, s18, 0xffff ; VI-NEXT: s_lshl_b32 s19, s19, 16 @@ -9316,13 +9455,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_readlane_b32 s18, v22, 42 ; VI-NEXT: s_and_b32 s16, s16, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_lshl_b32 s18, s62, 8 +; VI-NEXT: v_readlane_b32 s20, v22, 14 ; VI-NEXT: s_or_b32 s16, s16, s18 -; VI-NEXT: v_readlane_b32 s18, v22, 41 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: s_lshl_b32 s19, s74, 8 +; VI-NEXT: s_and_b32 s18, s55, 0xff +; VI-NEXT: s_lshl_b32 s19, s20, 8 ; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: s_and_b32 s16, s16, 0xffff ; VI-NEXT: s_lshl_b32 s18, s18, 16 @@ -9331,12 +9469,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s16 ; VI-NEXT: s_and_b32 s16, s17, 0xff -; VI-NEXT: v_readlane_b32 s17, v22, 40 -; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_lshl_b32 s17, s54, 8 +; VI-NEXT: v_readlane_b32 s18, v22, 35 ; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v22, 39 -; VI-NEXT: v_readlane_b32 s18, v22, 38 -; VI-NEXT: s_and_b32 s17, s17, 0xff +; VI-NEXT: s_and_b32 s17, s37, 0xff ; VI-NEXT: s_lshl_b32 s18, s18, 8 ; VI-NEXT: s_or_b32 s17, s17, s18 ; VI-NEXT: s_and_b32 s16, s16, 0xffff @@ -9345,13 +9481,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_readlane_b32 s16, v22, 37 ; VI-NEXT: s_and_b32 s14, s14, 0xff -; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_lshl_b32 s16, s53, 8 +; VI-NEXT: v_readlane_b32 s18, v22, 16 ; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: v_readlane_b32 s16, v22, 36 -; VI-NEXT: s_and_b32 s16, s16, 0xff -; VI-NEXT: s_lshl_b32 s17, s72, 8 +; VI-NEXT: s_and_b32 s16, s61, 0xff +; VI-NEXT: s_lshl_b32 s17, s18, 8 ; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: s_and_b32 s14, s14, 0xffff ; VI-NEXT: s_lshl_b32 s16, s16, 16 @@ -9360,13 +9495,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: s_and_b32 s14, s15, 0xff -; VI-NEXT: v_readlane_b32 s15, v22, 35 -; VI-NEXT: s_lshl_b32 s15, s15, 8 +; VI-NEXT: s_lshl_b32 s15, s60, 8 ; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: v_readlane_b32 s15, v22, 34 -; VI-NEXT: v_readlane_b32 s16, v22, 33 -; VI-NEXT: s_and_b32 s15, s15, 0xff -; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s15, s35, 0xff +; VI-NEXT: s_lshl_b32 s16, s34, 8 ; VI-NEXT: s_or_b32 s15, s15, s16 ; VI-NEXT: s_and_b32 s14, s14, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 @@ -9374,13 +9506,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_readlane_b32 s14, v22, 32 ; VI-NEXT: s_and_b32 s12, s12, 0xff -; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_lshl_b32 s14, s52, 8 +; VI-NEXT: v_readlane_b32 s16, v22, 18 ; VI-NEXT: s_or_b32 s12, s12, s14 -; VI-NEXT: v_readlane_b32 s14, v22, 31 -; VI-NEXT: s_and_b32 s14, s14, 0xff -; VI-NEXT: s_lshl_b32 s15, s62, 8 +; VI-NEXT: s_and_b32 s14, s51, 0xff +; VI-NEXT: s_lshl_b32 s15, s16, 8 ; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: s_and_b32 s12, s12, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 @@ -9389,13 +9520,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s12 ; VI-NEXT: s_and_b32 s12, s13, 0xff -; VI-NEXT: v_readlane_b32 s13, v22, 30 -; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_lshl_b32 s13, s50, 8 ; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: v_readlane_b32 s13, v22, 29 -; VI-NEXT: v_readlane_b32 s14, v22, 28 -; VI-NEXT: s_and_b32 s13, s13, 0xff -; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_and_b32 s13, s31, 0xff +; VI-NEXT: s_lshl_b32 s14, s30, 8 ; VI-NEXT: s_or_b32 s13, s13, s14 ; VI-NEXT: s_and_b32 s12, s12, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 @@ -9403,13 +9531,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s12 -; VI-NEXT: v_readlane_b32 s12, v22, 27 +; VI-NEXT: v_readlane_b32 s12, v22, 34 ; VI-NEXT: s_and_b32 s10, s10, 0xff ; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: v_readlane_b32 s14, v22, 20 ; VI-NEXT: s_or_b32 s10, s10, s12 -; VI-NEXT: v_readlane_b32 s12, v22, 26 -; VI-NEXT: v_readlane_b32 s14, v22, 0 -; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_and_b32 s12, s59, 0xff ; VI-NEXT: s_lshl_b32 s13, s14, 8 ; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: s_and_b32 s10, s10, 0xffff @@ -9419,13 +9546,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: s_and_b32 s10, s11, 0xff -; VI-NEXT: v_readlane_b32 s11, v22, 25 -; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_lshl_b32 s11, s58, 8 ; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: v_readlane_b32 s11, v22, 24 -; VI-NEXT: v_readlane_b32 s12, v22, 23 -; VI-NEXT: s_and_b32 s11, s11, 0xff -; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_and_b32 s11, s87, 0xff +; VI-NEXT: s_lshl_b32 s12, s86, 8 ; VI-NEXT: s_or_b32 s11, s11, s12 ; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 @@ -9433,13 +9557,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_readlane_b32 s10, v22, 22 ; VI-NEXT: s_and_b32 s8, s8, 0xff -; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_lshl_b32 s10, s28, 8 +; VI-NEXT: v_readlane_b32 s12, v22, 22 ; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: v_readlane_b32 s10, v22, 21 -; VI-NEXT: v_readlane_b32 s12, v22, 2 -; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_and_b32 s10, s89, 0xff ; VI-NEXT: s_lshl_b32 s11, s12, 8 ; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: s_and_b32 s8, s8, 0xffff @@ -9449,13 +9571,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_and_b32 s8, s9, 0xff -; VI-NEXT: v_readlane_b32 s9, v22, 20 -; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_lshl_b32 s9, s84, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: v_readlane_b32 s9, v22, 19 -; VI-NEXT: v_readlane_b32 s10, v22, 18 -; VI-NEXT: s_and_b32 s9, s9, 0xff -; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_and_b32 s9, s85, 0xff +; VI-NEXT: s_lshl_b32 s10, s82, 8 ; VI-NEXT: s_or_b32 s9, s9, s10 ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 @@ -9463,13 +9582,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_readlane_b32 s8, v22, 17 ; VI-NEXT: s_and_b32 s6, s6, 0xff -; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: v_readlane_b32 s10, v22, 24 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_readlane_b32 s8, v22, 16 -; VI-NEXT: v_readlane_b32 s10, v22, 4 -; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_and_b32 s8, s83, 0xff ; VI-NEXT: s_lshl_b32 s9, s10, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s6, s6, 0xffff @@ -9479,13 +9596,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_and_b32 s6, s7, 0xff -; VI-NEXT: v_readlane_b32 s7, v22, 15 -; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_lshl_b32 s7, s48, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 14 -; VI-NEXT: v_readlane_b32 s8, v22, 13 -; VI-NEXT: s_and_b32 s7, s7, 0xff -; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_and_b32 s7, s49, 0xff +; VI-NEXT: s_lshl_b32 s8, s91, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -9493,12 +9607,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_readlane_b32 s6, v22, 12 +; VI-NEXT: v_readlane_b32 s6, v22, 33 ; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_readlane_b32 s6, v22, 11 -; VI-NEXT: v_readlane_b32 s8, v22, 6 +; VI-NEXT: v_readlane_b32 s6, v22, 32 +; VI-NEXT: v_readlane_b32 s8, v22, 26 ; VI-NEXT: s_and_b32 s6, s6, 0xff ; VI-NEXT: s_lshl_b32 s7, s8, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 @@ -9509,11 +9623,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_and_b32 s4, s5, 0xff -; VI-NEXT: v_readlane_b32 s5, v22, 10 +; VI-NEXT: v_readlane_b32 s5, v22, 31 ; VI-NEXT: s_lshl_b32 s5, s5, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_readlane_b32 s5, v22, 9 -; VI-NEXT: v_readlane_b32 s6, v22, 8 +; VI-NEXT: v_readlane_b32 s5, v22, 30 +; VI-NEXT: v_readlane_b32 s6, v22, 29 ; VI-NEXT: s_and_b32 s5, s5, 0xff ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -9524,10 +9638,20 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_readlane_b32 s15, v22, 1 -; VI-NEXT: v_readlane_b32 s13, v22, 3 -; VI-NEXT: v_readlane_b32 s11, v22, 5 -; VI-NEXT: v_readlane_b32 s9, v22, 7 +; VI-NEXT: v_readlane_b32 s47, v22, 1 +; VI-NEXT: v_readlane_b32 s45, v22, 3 +; VI-NEXT: v_readlane_b32 s43, v22, 5 +; VI-NEXT: v_readlane_b32 s41, v22, 7 +; VI-NEXT: v_readlane_b32 s27, v22, 9 +; VI-NEXT: v_readlane_b32 s25, v22, 11 +; VI-NEXT: v_readlane_b32 s23, v22, 13 +; VI-NEXT: v_readlane_b32 s21, v22, 15 +; VI-NEXT: v_readlane_b32 s19, v22, 17 +; VI-NEXT: v_readlane_b32 s17, v22, 19 +; VI-NEXT: v_readlane_b32 s15, v22, 21 +; VI-NEXT: v_readlane_b32 s13, v22, 23 +; VI-NEXT: v_readlane_b32 s11, v22, 25 +; VI-NEXT: v_readlane_b32 s9, v22, 27 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_readlane_b32 s87, v21, 31 ; VI-NEXT: v_readlane_b32 s86, v21, 30 @@ -9567,164 +9691,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB13_4: -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr64 -; VI-NEXT: ; implicit-def: $sgpr55 -; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr51 -; VI-NEXT: ; implicit-def: $sgpr29 -; VI-NEXT: ; implicit-def: $sgpr28 -; VI-NEXT: ; implicit-def: $sgpr27 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr87 -; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr84 -; VI-NEXT: ; implicit-def: $sgpr83 -; VI-NEXT: ; implicit-def: $sgpr82 -; VI-NEXT: ; implicit-def: $sgpr81 -; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr71 -; VI-NEXT: ; implicit-def: $sgpr70 -; VI-NEXT: ; implicit-def: $sgpr69 -; VI-NEXT: ; implicit-def: $sgpr68 -; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr66 -; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr30 -; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v22, s60, 0 -; VI-NEXT: v_writelane_b32 v22, s61, 1 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v22, s60, 2 -; VI-NEXT: v_writelane_b32 v22, s61, 3 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v22, s60, 4 -; VI-NEXT: v_writelane_b32 v22, s61, 5 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v22, s60, 6 -; VI-NEXT: v_writelane_b32 v22, s61, 7 -; VI-NEXT: s_branch .LBB13_2 ; ; GFX9-LABEL: bitcast_v32i32_to_v128i8_scalar: ; GFX9: ; %bb.0: @@ -9820,276 +9786,250 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; GFX9-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s26, s5, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 2 -; GFX9-NEXT: s_lshr_b32 s26, s5, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 3 -; GFX9-NEXT: s_lshr_b32 s26, s5, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 4 -; GFX9-NEXT: s_lshr_b32 s26, s4, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 5 -; GFX9-NEXT: s_lshr_b32 s26, s4, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 6 -; GFX9-NEXT: s_lshr_b32 s26, s7, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 7 -; GFX9-NEXT: s_lshr_b32 s26, s7, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 8 -; GFX9-NEXT: s_lshr_b32 s26, s7, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 9 -; GFX9-NEXT: s_lshr_b32 s26, s6, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 10 -; GFX9-NEXT: s_lshr_b32 s26, s6, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 11 -; GFX9-NEXT: s_lshr_b32 s26, s9, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 12 ; GFX9-NEXT: s_lshr_b32 s26, s9, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 13 -; GFX9-NEXT: s_lshr_b32 s26, s9, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 14 -; GFX9-NEXT: s_lshr_b32 s26, s8, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 15 -; GFX9-NEXT: s_lshr_b32 s26, s8, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 16 -; GFX9-NEXT: s_lshr_b32 s26, s11, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 17 -; GFX9-NEXT: s_lshr_b32 s26, s11, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 18 -; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: s_lshr_b32 s26, s9, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 19 -; GFX9-NEXT: s_lshr_b32 s26, s10, 16 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 20 -; GFX9-NEXT: s_lshr_b32 s26, s10, 8 +; GFX9-NEXT: s_lshr_b32 s26, s8, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 21 -; GFX9-NEXT: s_lshr_b32 s26, s13, 24 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 22 -; GFX9-NEXT: s_lshr_b32 s26, s13, 16 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 23 -; GFX9-NEXT: s_lshr_b32 s26, s13, 8 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 24 -; GFX9-NEXT: s_lshr_b32 s26, s12, 16 +; GFX9-NEXT: s_lshr_b32 s26, s10, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 25 -; GFX9-NEXT: s_lshr_b32 s26, s12, 8 +; GFX9-NEXT: s_lshr_b32 s26, s10, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 26 -; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 27 -; GFX9-NEXT: s_lshr_b32 s26, s15, 16 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 28 -; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 29 -; GFX9-NEXT: s_lshr_b32 s26, s14, 16 +; GFX9-NEXT: s_lshr_b32 s26, s12, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 30 -; GFX9-NEXT: s_lshr_b32 s26, s14, 8 +; GFX9-NEXT: s_lshr_b32 s26, s12, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 31 -; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 32 -; GFX9-NEXT: s_lshr_b32 s26, s17, 16 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 33 -; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 34 -; GFX9-NEXT: s_lshr_b32 s26, s16, 16 +; GFX9-NEXT: s_lshr_b32 s26, s14, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 35 -; GFX9-NEXT: s_lshr_b32 s26, s16, 8 +; GFX9-NEXT: s_lshr_b32 s26, s14, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 36 -; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 37 -; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 38 -; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 39 -; GFX9-NEXT: s_lshr_b32 s26, s18, 16 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 40 -; GFX9-NEXT: s_lshr_b32 s26, s18, 8 +; GFX9-NEXT: s_lshr_b32 s26, s16, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 41 -; GFX9-NEXT: s_lshr_b32 s26, s21, 24 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 42 -; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 43 -; GFX9-NEXT: s_lshr_b32 s26, s21, 8 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 44 -; GFX9-NEXT: s_lshr_b32 s26, s20, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 45 -; GFX9-NEXT: s_lshr_b32 s26, s20, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 46 -; GFX9-NEXT: s_lshr_b32 s26, s23, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 47 -; GFX9-NEXT: s_lshr_b32 s26, s23, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 48 -; GFX9-NEXT: s_lshr_b32 s26, s23, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 49 -; GFX9-NEXT: s_lshr_b32 s26, s22, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 50 -; GFX9-NEXT: s_lshr_b64 s[28:29], s[4:5], 24 -; GFX9-NEXT: v_writelane_b32 v22, s28, 0 -; GFX9-NEXT: s_lshr_b32 s82, s22, 8 -; GFX9-NEXT: s_lshr_b32 s83, s25, 24 -; GFX9-NEXT: s_lshr_b32 s81, s25, 16 -; GFX9-NEXT: s_lshr_b32 s84, s25, 8 -; GFX9-NEXT: s_lshr_b32 s85, s24, 16 -; GFX9-NEXT: s_lshr_b32 s86, s24, 8 -; GFX9-NEXT: s_lshr_b32 s87, s41, 24 -; GFX9-NEXT: s_lshr_b32 s96, s41, 16 -; GFX9-NEXT: s_lshr_b32 s97, s41, 8 -; GFX9-NEXT: s_lshr_b32 s98, s40, 16 -; GFX9-NEXT: s_lshr_b32 s99, s40, 8 -; GFX9-NEXT: s_lshr_b32 s38, s43, 24 -; GFX9-NEXT: s_lshr_b32 s39, s43, 16 -; GFX9-NEXT: s_lshr_b32 s48, s43, 8 -; GFX9-NEXT: s_lshr_b32 s49, s42, 16 -; GFX9-NEXT: s_lshr_b32 s50, s42, 8 -; GFX9-NEXT: s_lshr_b32 s51, s45, 24 -; GFX9-NEXT: s_lshr_b32 s52, s45, 16 -; GFX9-NEXT: s_lshr_b32 s53, s45, 8 -; GFX9-NEXT: s_lshr_b32 s54, s44, 16 -; GFX9-NEXT: s_lshr_b32 s55, s44, 8 -; GFX9-NEXT: s_lshr_b32 s64, s47, 24 -; GFX9-NEXT: s_lshr_b32 s65, s47, 16 -; GFX9-NEXT: s_lshr_b32 s66, s47, 8 -; GFX9-NEXT: s_lshr_b32 s67, s46, 16 -; GFX9-NEXT: s_lshr_b32 s68, s46, 8 -; GFX9-NEXT: s_lshr_b32 s69, s57, 24 -; GFX9-NEXT: s_lshr_b32 s70, s57, 16 -; GFX9-NEXT: s_lshr_b32 s71, s57, 8 -; GFX9-NEXT: s_lshr_b32 s80, s56, 16 -; GFX9-NEXT: s_lshr_b32 s26, s56, 8 -; GFX9-NEXT: v_writelane_b32 v22, s29, 1 -; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 16 +; GFX9-NEXT: v_writelane_b32 v22, s79, 17 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 14 +; GFX9-NEXT: v_writelane_b32 v22, s79, 15 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 12 +; GFX9-NEXT: v_writelane_b32 v22, s79, 13 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[10:11], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 10 +; GFX9-NEXT: v_writelane_b32 v22, s79, 11 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 8 +; GFX9-NEXT: v_writelane_b32 v22, s79, 9 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 6 +; GFX9-NEXT: v_writelane_b32 v22, s79, 7 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[16:17], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 4 +; GFX9-NEXT: s_lshr_b32 s90, s5, 24 +; GFX9-NEXT: v_writelane_b32 v22, s79, 5 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[18:19], 24 ; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; GFX9-NEXT: s_lshr_b32 s92, s5, 16 +; GFX9-NEXT: v_writelane_b32 v22, s78, 2 +; GFX9-NEXT: s_mov_b32 s89, s90 ; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX9-NEXT: s_lshr_b32 s94, s5, 8 +; GFX9-NEXT: v_writelane_b32 v22, s79, 3 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_mov_b32 s91, s92 ; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 +; GFX9-NEXT: s_lshr_b32 vcc_lo, s4, 16 +; GFX9-NEXT: s_lshr_b32 vcc_hi, s4, 8 +; GFX9-NEXT: s_lshr_b32 s36, s7, 24 +; GFX9-NEXT: s_lshr_b32 s77, s7, 8 +; GFX9-NEXT: v_writelane_b32 v22, s78, 0 +; GFX9-NEXT: s_mov_b32 s93, s94 ; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 ; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 ; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 +; GFX9-NEXT: s_lshr_b32 s37, s7, 16 +; GFX9-NEXT: s_lshr_b32 s53, s6, 16 +; GFX9-NEXT: s_lshr_b32 s55, s6, 8 +; GFX9-NEXT: s_lshr_b32 s54, s9, 24 +; GFX9-NEXT: s_lshr_b32 s52, s18, 16 +; GFX9-NEXT: s_lshr_b32 s76, s18, 8 +; GFX9-NEXT: s_lshr_b32 s64, s21, 24 +; GFX9-NEXT: s_lshr_b32 s65, s21, 16 +; GFX9-NEXT: s_lshr_b32 s66, s21, 8 +; GFX9-NEXT: s_lshr_b32 s67, s20, 16 +; GFX9-NEXT: s_lshr_b32 s68, s20, 8 +; GFX9-NEXT: s_lshr_b32 s69, s23, 24 +; GFX9-NEXT: s_lshr_b32 s26, s23, 16 +; GFX9-NEXT: s_lshr_b32 s27, s23, 8 +; GFX9-NEXT: s_lshr_b32 s70, s22, 16 +; GFX9-NEXT: s_lshr_b32 s71, s22, 8 +; GFX9-NEXT: s_lshr_b32 s80, s25, 24 +; GFX9-NEXT: s_lshr_b32 s28, s25, 16 +; GFX9-NEXT: s_lshr_b32 s29, s25, 8 +; GFX9-NEXT: s_lshr_b32 s81, s24, 16 +; GFX9-NEXT: s_lshr_b32 s82, s24, 8 +; GFX9-NEXT: s_lshr_b32 s83, s41, 24 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s41, 8 +; GFX9-NEXT: s_lshr_b32 s84, s40, 16 +; GFX9-NEXT: s_lshr_b32 s85, s40, 8 +; GFX9-NEXT: s_lshr_b32 s86, s43, 24 +; GFX9-NEXT: s_lshr_b32 s60, s43, 16 +; GFX9-NEXT: s_lshr_b32 s61, s43, 8 +; GFX9-NEXT: s_lshr_b32 s87, s42, 16 +; GFX9-NEXT: s_lshr_b32 s96, s42, 8 +; GFX9-NEXT: s_lshr_b32 s97, s45, 24 +; GFX9-NEXT: s_lshr_b32 s62, s45, 16 +; GFX9-NEXT: s_lshr_b32 s63, s45, 8 +; GFX9-NEXT: s_lshr_b32 s98, s44, 16 +; GFX9-NEXT: s_lshr_b32 s99, s44, 8 +; GFX9-NEXT: s_lshr_b32 s38, s47, 24 +; GFX9-NEXT: s_lshr_b32 s72, s47, 16 +; GFX9-NEXT: s_lshr_b32 s73, s47, 8 +; GFX9-NEXT: s_lshr_b32 s39, s46, 16 +; GFX9-NEXT: s_lshr_b32 s48, s46, 8 +; GFX9-NEXT: s_lshr_b32 s49, s57, 24 +; GFX9-NEXT: s_lshr_b32 s74, s57, 16 +; GFX9-NEXT: s_lshr_b32 s75, s57, 8 +; GFX9-NEXT: s_lshr_b32 s50, s56, 16 +; GFX9-NEXT: s_lshr_b32 s51, s56, 8 +; GFX9-NEXT: v_writelane_b32 v22, s79, 1 +; GFX9-NEXT: s_mov_b32 s95, vcc_lo +; GFX9-NEXT: s_mov_b32 s31, vcc_hi +; GFX9-NEXT: s_mov_b32 s35, s36 +; GFX9-NEXT: s_mov_b32 s36, s77 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[56:57], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB13_3 ; GFX9-NEXT: .LBB13_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s5, s5, 3 -; GFX9-NEXT: s_lshr_b32 s26, s5, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 2 -; GFX9-NEXT: s_lshr_b32 s26, s5, 16 ; GFX9-NEXT: s_add_i32 s4, s4, 3 -; GFX9-NEXT: v_writelane_b32 v22, s26, 3 -; GFX9-NEXT: s_lshr_b32 s26, s5, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 4 -; GFX9-NEXT: s_lshr_b32 s26, s4, 16 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 ; GFX9-NEXT: s_add_i32 s7, s7, 3 -; GFX9-NEXT: v_writelane_b32 v22, s26, 5 -; GFX9-NEXT: s_lshr_b32 s26, s4, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 6 -; GFX9-NEXT: s_lshr_b32 s26, s7, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 7 -; GFX9-NEXT: s_lshr_b32 s26, s7, 16 ; GFX9-NEXT: s_add_i32 s6, s6, 3 -; GFX9-NEXT: v_writelane_b32 v22, s26, 8 -; GFX9-NEXT: s_lshr_b32 s26, s7, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 9 -; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: v_writelane_b32 v22, s26, 16 +; GFX9-NEXT: v_writelane_b32 v22, s27, 17 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 ; GFX9-NEXT: s_add_i32 s9, s9, 3 -; GFX9-NEXT: v_writelane_b32 v22, s26, 10 -; GFX9-NEXT: s_lshr_b32 s26, s6, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 11 -; GFX9-NEXT: s_lshr_b32 s26, s9, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 12 -; GFX9-NEXT: s_lshr_b32 s26, s9, 16 ; GFX9-NEXT: s_add_i32 s8, s8, 3 -; GFX9-NEXT: v_writelane_b32 v22, s26, 13 -; GFX9-NEXT: s_lshr_b32 s26, s9, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 14 -; GFX9-NEXT: s_lshr_b32 s26, s8, 16 +; GFX9-NEXT: v_writelane_b32 v22, s27, 15 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 ; GFX9-NEXT: s_add_i32 s11, s11, 3 -; GFX9-NEXT: v_writelane_b32 v22, s26, 15 -; GFX9-NEXT: s_lshr_b32 s26, s8, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 16 -; GFX9-NEXT: s_lshr_b32 s26, s11, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 17 -; GFX9-NEXT: s_lshr_b32 s26, s11, 16 ; GFX9-NEXT: s_add_i32 s10, s10, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 12 +; GFX9-NEXT: v_writelane_b32 v22, s27, 13 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 10 +; GFX9-NEXT: v_writelane_b32 v22, s27, 11 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 8 +; GFX9-NEXT: v_writelane_b32 v22, s27, 9 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 6 +; GFX9-NEXT: v_writelane_b32 v22, s27, 7 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 4 +; GFX9-NEXT: v_writelane_b32 v22, s27, 5 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[18:19], 24 +; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 2 +; GFX9-NEXT: v_writelane_b32 v22, s27, 3 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[20:21], 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 0 +; GFX9-NEXT: v_writelane_b32 v22, s27, 1 +; GFX9-NEXT: s_lshr_b32 s26, s9, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 18 -; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: s_lshr_b32 s26, s9, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 19 -; GFX9-NEXT: s_lshr_b32 s26, s10, 16 -; GFX9-NEXT: s_add_i32 s13, s13, 3 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 20 -; GFX9-NEXT: s_lshr_b32 s26, s10, 8 +; GFX9-NEXT: s_lshr_b32 s26, s8, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 21 -; GFX9-NEXT: s_lshr_b32 s26, s13, 24 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 22 -; GFX9-NEXT: s_lshr_b32 s26, s13, 16 -; GFX9-NEXT: s_add_i32 s12, s12, 3 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 23 -; GFX9-NEXT: s_lshr_b32 s26, s13, 8 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 24 -; GFX9-NEXT: s_lshr_b32 s26, s12, 16 -; GFX9-NEXT: s_add_i32 s15, s15, 3 +; GFX9-NEXT: s_lshr_b32 s26, s10, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 25 -; GFX9-NEXT: s_lshr_b32 s26, s12, 8 +; GFX9-NEXT: s_lshr_b32 s26, s10, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 26 -; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 27 -; GFX9-NEXT: s_lshr_b32 s26, s15, 16 -; GFX9-NEXT: s_add_i32 s14, s14, 3 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 28 -; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 29 -; GFX9-NEXT: s_lshr_b32 s26, s14, 16 -; GFX9-NEXT: s_add_i32 s17, s17, 3 +; GFX9-NEXT: s_lshr_b32 s26, s12, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 30 -; GFX9-NEXT: s_lshr_b32 s26, s14, 8 +; GFX9-NEXT: s_lshr_b32 s26, s12, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 31 -; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 32 -; GFX9-NEXT: s_lshr_b32 s26, s17, 16 -; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 33 -; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 34 -; GFX9-NEXT: s_lshr_b32 s26, s16, 16 -; GFX9-NEXT: s_add_i32 s19, s19, 3 +; GFX9-NEXT: s_lshr_b32 s26, s14, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 35 -; GFX9-NEXT: s_lshr_b32 s26, s16, 8 +; GFX9-NEXT: s_lshr_b32 s26, s14, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 36 -; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 37 -; GFX9-NEXT: s_lshr_b32 s26, s19, 16 -; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 38 -; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 39 -; GFX9-NEXT: s_lshr_b32 s26, s18, 16 -; GFX9-NEXT: s_add_i32 s21, s21, 3 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 40 -; GFX9-NEXT: s_lshr_b32 s26, s18, 8 +; GFX9-NEXT: s_lshr_b32 s26, s16, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 41 -; GFX9-NEXT: s_lshr_b32 s26, s21, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 42 -; GFX9-NEXT: s_lshr_b32 s26, s21, 16 -; GFX9-NEXT: s_add_i32 s20, s20, 3 -; GFX9-NEXT: v_writelane_b32 v22, s26, 43 -; GFX9-NEXT: s_lshr_b32 s26, s21, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 44 -; GFX9-NEXT: s_lshr_b32 s26, s20, 16 -; GFX9-NEXT: s_add_i32 s23, s23, 3 -; GFX9-NEXT: v_writelane_b32 v22, s26, 45 -; GFX9-NEXT: s_lshr_b32 s26, s20, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 46 -; GFX9-NEXT: s_lshr_b32 s26, s23, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 47 -; GFX9-NEXT: s_lshr_b32 s26, s23, 16 -; GFX9-NEXT: s_add_i32 s22, s22, 3 -; GFX9-NEXT: v_writelane_b32 v22, s26, 48 -; GFX9-NEXT: s_lshr_b32 s26, s23, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 49 -; GFX9-NEXT: s_lshr_b32 s26, s22, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 50 -; GFX9-NEXT: s_lshr_b64 s[28:29], s[4:5], 24 -; GFX9-NEXT: s_add_i32 s57, s57, 3 -; GFX9-NEXT: s_add_i32 s56, s56, 3 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 ; GFX9-NEXT: s_add_i32 s47, s47, 3 ; GFX9-NEXT: s_add_i32 s46, s46, 3 ; GFX9-NEXT: s_add_i32 s45, s45, 3 @@ -10100,198 +10040,195 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_add_i32 s40, s40, 3 ; GFX9-NEXT: s_add_i32 s25, s25, 3 ; GFX9-NEXT: s_add_i32 s24, s24, 3 -; GFX9-NEXT: v_writelane_b32 v22, s28, 0 -; GFX9-NEXT: s_lshr_b32 s82, s22, 8 -; GFX9-NEXT: s_lshr_b32 s83, s25, 24 -; GFX9-NEXT: s_lshr_b32 s81, s25, 16 -; GFX9-NEXT: s_lshr_b32 s84, s25, 8 -; GFX9-NEXT: s_lshr_b32 s85, s24, 16 -; GFX9-NEXT: s_lshr_b32 s86, s24, 8 -; GFX9-NEXT: s_lshr_b32 s87, s41, 24 -; GFX9-NEXT: s_lshr_b32 s96, s41, 16 -; GFX9-NEXT: s_lshr_b32 s97, s41, 8 -; GFX9-NEXT: s_lshr_b32 s98, s40, 16 -; GFX9-NEXT: s_lshr_b32 s99, s40, 8 -; GFX9-NEXT: s_lshr_b32 s38, s43, 24 -; GFX9-NEXT: s_lshr_b32 s39, s43, 16 -; GFX9-NEXT: s_lshr_b32 s48, s43, 8 -; GFX9-NEXT: s_lshr_b32 s49, s42, 16 -; GFX9-NEXT: s_lshr_b32 s50, s42, 8 -; GFX9-NEXT: s_lshr_b32 s51, s45, 24 -; GFX9-NEXT: s_lshr_b32 s52, s45, 16 -; GFX9-NEXT: s_lshr_b32 s53, s45, 8 -; GFX9-NEXT: s_lshr_b32 s54, s44, 16 -; GFX9-NEXT: s_lshr_b32 s55, s44, 8 -; GFX9-NEXT: s_lshr_b32 s64, s47, 24 -; GFX9-NEXT: s_lshr_b32 s65, s47, 16 -; GFX9-NEXT: s_lshr_b32 s66, s47, 8 -; GFX9-NEXT: s_lshr_b32 s67, s46, 16 -; GFX9-NEXT: s_lshr_b32 s68, s46, 8 -; GFX9-NEXT: s_lshr_b32 s69, s57, 24 -; GFX9-NEXT: s_lshr_b32 s70, s57, 16 -; GFX9-NEXT: s_lshr_b32 s71, s57, 8 -; GFX9-NEXT: s_lshr_b32 s80, s56, 16 -; GFX9-NEXT: s_lshr_b32 s26, s56, 8 -; GFX9-NEXT: v_writelane_b32 v22, s29, 1 -; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_add_i32 s23, s23, 3 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: v_writelane_b32 v22, s26, 42 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: s_add_i32 s57, s57, 3 +; GFX9-NEXT: s_add_i32 s56, s56, 3 ; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 ; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 ; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 ; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 ; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 ; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 43 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[56:57], 24 +; GFX9-NEXT: s_lshr_b32 s89, s5, 24 +; GFX9-NEXT: s_lshr_b32 s91, s5, 16 +; GFX9-NEXT: s_lshr_b32 s93, s5, 8 +; GFX9-NEXT: s_lshr_b32 s95, s4, 16 +; GFX9-NEXT: s_lshr_b32 s31, s4, 8 +; GFX9-NEXT: s_lshr_b32 s35, s7, 24 +; GFX9-NEXT: s_lshr_b32 s37, s7, 16 +; GFX9-NEXT: s_lshr_b32 s36, s7, 8 +; GFX9-NEXT: s_lshr_b32 s53, s6, 16 +; GFX9-NEXT: s_lshr_b32 s55, s6, 8 +; GFX9-NEXT: s_lshr_b32 s54, s9, 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 44 +; GFX9-NEXT: s_lshr_b32 s52, s18, 16 +; GFX9-NEXT: s_lshr_b32 s76, s18, 8 +; GFX9-NEXT: s_lshr_b32 s64, s21, 24 +; GFX9-NEXT: s_lshr_b32 s65, s21, 16 +; GFX9-NEXT: s_lshr_b32 s66, s21, 8 +; GFX9-NEXT: s_lshr_b32 s67, s20, 16 +; GFX9-NEXT: s_lshr_b32 s68, s20, 8 +; GFX9-NEXT: s_lshr_b32 s69, s23, 24 +; GFX9-NEXT: s_lshr_b32 s26, s23, 16 +; GFX9-NEXT: s_lshr_b32 s27, s23, 8 +; GFX9-NEXT: s_lshr_b32 s70, s22, 16 +; GFX9-NEXT: s_lshr_b32 s71, s22, 8 +; GFX9-NEXT: s_lshr_b32 s80, s25, 24 +; GFX9-NEXT: s_lshr_b32 s28, s25, 16 +; GFX9-NEXT: s_lshr_b32 s29, s25, 8 +; GFX9-NEXT: s_lshr_b32 s81, s24, 16 +; GFX9-NEXT: s_lshr_b32 s82, s24, 8 +; GFX9-NEXT: s_lshr_b32 s83, s41, 24 +; GFX9-NEXT: s_lshr_b32 s58, s41, 16 +; GFX9-NEXT: s_lshr_b32 s59, s41, 8 +; GFX9-NEXT: s_lshr_b32 s84, s40, 16 +; GFX9-NEXT: s_lshr_b32 s85, s40, 8 +; GFX9-NEXT: s_lshr_b32 s86, s43, 24 +; GFX9-NEXT: s_lshr_b32 s60, s43, 16 +; GFX9-NEXT: s_lshr_b32 s61, s43, 8 +; GFX9-NEXT: s_lshr_b32 s87, s42, 16 +; GFX9-NEXT: s_lshr_b32 s96, s42, 8 +; GFX9-NEXT: s_lshr_b32 s97, s45, 24 +; GFX9-NEXT: s_lshr_b32 s62, s45, 16 +; GFX9-NEXT: s_lshr_b32 s63, s45, 8 +; GFX9-NEXT: s_lshr_b32 s98, s44, 16 +; GFX9-NEXT: s_lshr_b32 s99, s44, 8 +; GFX9-NEXT: s_lshr_b32 s38, s47, 24 +; GFX9-NEXT: s_lshr_b32 s72, s47, 16 +; GFX9-NEXT: s_lshr_b32 s73, s47, 8 +; GFX9-NEXT: s_lshr_b32 s39, s46, 16 +; GFX9-NEXT: s_lshr_b32 s48, s46, 8 +; GFX9-NEXT: s_lshr_b32 s49, s57, 24 +; GFX9-NEXT: s_lshr_b32 s74, s57, 16 +; GFX9-NEXT: s_lshr_b32 s75, s57, 8 +; GFX9-NEXT: s_lshr_b32 s50, s56, 16 +; GFX9-NEXT: s_lshr_b32 s51, s56, 8 ; GFX9-NEXT: .LBB13_3: ; %end -; GFX9-NEXT: s_lshl_b32 s26, s26, 8 -; GFX9-NEXT: s_and_b32 s27, s56, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 -; GFX9-NEXT: s_lshl_b32 s27, s36, 8 -; GFX9-NEXT: s_and_b32 s29, s80, 0xff -; GFX9-NEXT: s_or_b32 s27, s29, s27 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v1, s26 -; GFX9-NEXT: s_and_b32 s26, s57, 0xff -; GFX9-NEXT: s_lshl_b32 s27, s71, 8 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: s_and_b32 s27, s70, 0xff -; GFX9-NEXT: s_lshl_b32 s29, s69, 8 -; GFX9-NEXT: s_or_b32 s27, s27, s29 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-NEXT: s_lshl_b32 s26, s68, 8 -; GFX9-NEXT: s_and_b32 s27, s46, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 -; GFX9-NEXT: s_lshl_b32 s27, s34, 8 -; GFX9-NEXT: s_and_b32 s29, s67, 0xff -; GFX9-NEXT: s_or_b32 s27, s29, s27 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v3, s26 -; GFX9-NEXT: s_and_b32 s26, s47, 0xff -; GFX9-NEXT: s_lshl_b32 s27, s66, 8 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: s_and_b32 s27, s65, 0xff -; GFX9-NEXT: s_lshl_b32 s29, s64, 8 -; GFX9-NEXT: s_or_b32 s27, s27, s29 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v4, s26 -; GFX9-NEXT: s_lshl_b32 s26, s55, 8 -; GFX9-NEXT: s_and_b32 s27, s44, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 -; GFX9-NEXT: s_lshl_b32 s27, s30, 8 -; GFX9-NEXT: s_and_b32 s29, s54, 0xff -; GFX9-NEXT: s_or_b32 s27, s29, s27 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: s_and_b32 s26, s45, 0xff -; GFX9-NEXT: s_lshl_b32 s27, s53, 8 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: s_and_b32 s27, s52, 0xff -; GFX9-NEXT: s_lshl_b32 s29, s51, 8 -; GFX9-NEXT: s_or_b32 s27, s27, s29 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v6, s26 -; GFX9-NEXT: s_lshl_b32 s26, s50, 8 -; GFX9-NEXT: s_and_b32 s27, s42, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 -; GFX9-NEXT: s_lshl_b32 s27, s94, 8 -; GFX9-NEXT: s_and_b32 s29, s49, 0xff -; GFX9-NEXT: s_or_b32 s27, s29, s27 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v7, s26 -; GFX9-NEXT: s_and_b32 s26, s43, 0xff -; GFX9-NEXT: s_lshl_b32 s27, s48, 8 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: s_and_b32 s27, s39, 0xff -; GFX9-NEXT: s_lshl_b32 s29, s38, 8 -; GFX9-NEXT: s_or_b32 s27, s27, s29 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v8, s26 -; GFX9-NEXT: s_lshl_b32 s26, s99, 8 -; GFX9-NEXT: s_and_b32 s27, s40, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 -; GFX9-NEXT: s_lshl_b32 s27, s92, 8 -; GFX9-NEXT: s_and_b32 s29, s98, 0xff -; GFX9-NEXT: s_or_b32 s27, s29, s27 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v9, s26 -; GFX9-NEXT: s_and_b32 s26, s41, 0xff -; GFX9-NEXT: s_lshl_b32 s27, s97, 8 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: s_and_b32 s27, s96, 0xff -; GFX9-NEXT: s_lshl_b32 s29, s87, 8 -; GFX9-NEXT: s_or_b32 s27, s27, s29 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: s_lshl_b32 s26, s86, 8 +; GFX9-NEXT: s_and_b32 s56, s56, 0xff +; GFX9-NEXT: s_lshl_b32 s77, s51, 8 +; GFX9-NEXT: s_or_b32 s56, s56, s77 +; GFX9-NEXT: s_and_b32 s77, s50, 0xff +; GFX9-NEXT: s_lshl_b32 s79, s78, 8 +; GFX9-NEXT: s_or_b32 s77, s77, s79 +; GFX9-NEXT: s_and_b32 s56, s56, 0xffff +; GFX9-NEXT: s_lshl_b32 s77, s77, 16 +; GFX9-NEXT: s_or_b32 s56, s56, s77 +; GFX9-NEXT: v_mov_b32_e32 v1, s56 +; GFX9-NEXT: s_and_b32 s56, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s57, s75, 8 +; GFX9-NEXT: s_or_b32 s56, s56, s57 +; GFX9-NEXT: s_and_b32 s57, s74, 0xff +; GFX9-NEXT: s_lshl_b32 s74, s49, 8 +; GFX9-NEXT: s_or_b32 s57, s57, s74 +; GFX9-NEXT: s_and_b32 s56, s56, 0xffff +; GFX9-NEXT: s_lshl_b32 s57, s57, 16 +; GFX9-NEXT: s_or_b32 s56, s56, s57 +; GFX9-NEXT: v_mov_b32_e32 v2, s56 +; GFX9-NEXT: s_and_b32 s46, s46, 0xff +; GFX9-NEXT: s_lshl_b32 s56, s48, 8 +; GFX9-NEXT: s_or_b32 s46, s46, s56 +; GFX9-NEXT: s_and_b32 s56, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s57, s34, 8 +; GFX9-NEXT: s_or_b32 s56, s56, s57 +; GFX9-NEXT: s_and_b32 s46, s46, 0xffff +; GFX9-NEXT: s_lshl_b32 s56, s56, 16 +; GFX9-NEXT: s_or_b32 s46, s46, s56 +; GFX9-NEXT: v_mov_b32_e32 v3, s46 +; GFX9-NEXT: s_and_b32 s46, s47, 0xff +; GFX9-NEXT: s_lshl_b32 s47, s73, 8 +; GFX9-NEXT: s_or_b32 s46, s46, s47 +; GFX9-NEXT: s_and_b32 s47, s72, 0xff +; GFX9-NEXT: s_lshl_b32 s56, s38, 8 +; GFX9-NEXT: s_or_b32 s47, s47, s56 +; GFX9-NEXT: s_and_b32 s46, s46, 0xffff +; GFX9-NEXT: s_lshl_b32 s47, s47, 16 +; GFX9-NEXT: s_or_b32 s46, s46, s47 +; GFX9-NEXT: v_mov_b32_e32 v4, s46 +; GFX9-NEXT: s_and_b32 s44, s44, 0xff +; GFX9-NEXT: s_lshl_b32 s46, s99, 8 +; GFX9-NEXT: s_or_b32 s44, s44, s46 +; GFX9-NEXT: s_and_b32 s46, s98, 0xff +; GFX9-NEXT: s_lshl_b32 s47, s30, 8 +; GFX9-NEXT: s_or_b32 s46, s46, s47 +; GFX9-NEXT: s_and_b32 s44, s44, 0xffff +; GFX9-NEXT: s_lshl_b32 s46, s46, 16 +; GFX9-NEXT: s_or_b32 s44, s44, s46 +; GFX9-NEXT: v_mov_b32_e32 v5, s44 +; GFX9-NEXT: s_and_b32 s44, s45, 0xff +; GFX9-NEXT: s_lshl_b32 s45, s63, 8 +; GFX9-NEXT: s_or_b32 s44, s44, s45 +; GFX9-NEXT: s_and_b32 s45, s62, 0xff +; GFX9-NEXT: s_lshl_b32 s46, s97, 8 +; GFX9-NEXT: s_or_b32 s45, s45, s46 +; GFX9-NEXT: s_and_b32 s44, s44, 0xffff +; GFX9-NEXT: s_lshl_b32 s45, s45, 16 +; GFX9-NEXT: s_or_b32 s44, s44, s45 +; GFX9-NEXT: v_mov_b32_e32 v6, s44 +; GFX9-NEXT: s_and_b32 s42, s42, 0xff +; GFX9-NEXT: s_lshl_b32 s44, s96, 8 +; GFX9-NEXT: s_or_b32 s42, s42, s44 +; GFX9-NEXT: s_and_b32 s44, s87, 0xff +; GFX9-NEXT: s_lshl_b32 s45, s94, 8 +; GFX9-NEXT: s_or_b32 s44, s44, s45 +; GFX9-NEXT: s_and_b32 s42, s42, 0xffff +; GFX9-NEXT: s_lshl_b32 s44, s44, 16 +; GFX9-NEXT: s_or_b32 s42, s42, s44 +; GFX9-NEXT: v_mov_b32_e32 v7, s42 +; GFX9-NEXT: s_and_b32 s42, s43, 0xff +; GFX9-NEXT: s_lshl_b32 s43, s61, 8 +; GFX9-NEXT: s_or_b32 s42, s42, s43 +; GFX9-NEXT: s_and_b32 s43, s60, 0xff +; GFX9-NEXT: s_lshl_b32 s44, s86, 8 +; GFX9-NEXT: s_or_b32 s43, s43, s44 +; GFX9-NEXT: s_and_b32 s42, s42, 0xffff +; GFX9-NEXT: s_lshl_b32 s43, s43, 16 +; GFX9-NEXT: s_or_b32 s42, s42, s43 +; GFX9-NEXT: v_mov_b32_e32 v8, s42 +; GFX9-NEXT: s_and_b32 s40, s40, 0xff +; GFX9-NEXT: s_lshl_b32 s42, s85, 8 +; GFX9-NEXT: s_or_b32 s40, s40, s42 +; GFX9-NEXT: s_and_b32 s42, s84, 0xff +; GFX9-NEXT: s_lshl_b32 s43, s92, 8 +; GFX9-NEXT: s_or_b32 s42, s42, s43 +; GFX9-NEXT: s_and_b32 s40, s40, 0xffff +; GFX9-NEXT: s_lshl_b32 s42, s42, 16 +; GFX9-NEXT: s_or_b32 s40, s40, s42 +; GFX9-NEXT: v_mov_b32_e32 v9, s40 +; GFX9-NEXT: s_and_b32 s40, s41, 0xff +; GFX9-NEXT: s_lshl_b32 s41, s59, 8 +; GFX9-NEXT: s_or_b32 s40, s40, s41 +; GFX9-NEXT: s_and_b32 s41, s58, 0xff +; GFX9-NEXT: s_lshl_b32 s42, s83, 8 +; GFX9-NEXT: s_or_b32 s41, s41, s42 +; GFX9-NEXT: s_and_b32 s40, s40, 0xffff +; GFX9-NEXT: s_lshl_b32 s41, s41, 16 +; GFX9-NEXT: s_or_b32 s40, s40, s41 +; GFX9-NEXT: v_mov_b32_e32 v10, s40 ; GFX9-NEXT: s_and_b32 s24, s24, 0xff -; GFX9-NEXT: s_or_b32 s24, s24, s26 -; GFX9-NEXT: s_lshl_b32 s26, s90, 8 -; GFX9-NEXT: s_and_b32 s27, s85, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_lshl_b32 s40, s82, 8 +; GFX9-NEXT: s_or_b32 s24, s24, s40 +; GFX9-NEXT: s_and_b32 s40, s81, 0xff +; GFX9-NEXT: s_lshl_b32 s41, s90, 8 +; GFX9-NEXT: s_or_b32 s40, s40, s41 ; GFX9-NEXT: s_and_b32 s24, s24, 0xffff -; GFX9-NEXT: s_lshl_b32 s26, s26, 16 -; GFX9-NEXT: s_or_b32 s24, s24, s26 +; GFX9-NEXT: s_lshl_b32 s40, s40, 16 +; GFX9-NEXT: s_or_b32 s24, s24, s40 ; GFX9-NEXT: v_mov_b32_e32 v11, s24 ; GFX9-NEXT: s_and_b32 s24, s25, 0xff -; GFX9-NEXT: s_lshl_b32 s25, s84, 8 +; GFX9-NEXT: s_lshl_b32 s25, s29, 8 ; GFX9-NEXT: s_or_b32 s24, s24, s25 -; GFX9-NEXT: s_and_b32 s25, s81, 0xff -; GFX9-NEXT: s_lshl_b32 s26, s83, 8 -; GFX9-NEXT: s_or_b32 s25, s25, s26 +; GFX9-NEXT: s_and_b32 s25, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s28, s80, 8 +; GFX9-NEXT: s_or_b32 s25, s25, s28 ; GFX9-NEXT: s_and_b32 s24, s24, 0xffff ; GFX9-NEXT: s_lshl_b32 s25, s25, 16 ; GFX9-NEXT: s_or_b32 s24, s24, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s24 -; GFX9-NEXT: s_lshl_b32 s24, s82, 8 -; GFX9-NEXT: s_and_b32 s22, s22, 0xff -; GFX9-NEXT: v_readlane_b32 s25, v22, 50 -; GFX9-NEXT: s_or_b32 s22, s22, s24 -; GFX9-NEXT: s_lshl_b32 s24, s88, 8 -; GFX9-NEXT: s_and_b32 s25, s25, 0xff -; GFX9-NEXT: s_or_b32 s24, s25, s24 -; GFX9-NEXT: s_and_b32 s22, s22, 0xffff -; GFX9-NEXT: s_lshl_b32 s24, s24, 16 -; GFX9-NEXT: s_or_b32 s22, s22, s24 -; GFX9-NEXT: v_mov_b32_e32 v13, s22 -; GFX9-NEXT: s_and_b32 s22, s23, 0xff -; GFX9-NEXT: v_readlane_b32 s23, v22, 49 -; GFX9-NEXT: s_lshl_b32 s23, s23, 8 -; GFX9-NEXT: s_or_b32 s22, s22, s23 -; GFX9-NEXT: v_readlane_b32 s23, v22, 48 -; GFX9-NEXT: v_readlane_b32 s24, v22, 47 -; GFX9-NEXT: s_and_b32 s23, s23, 0xff -; GFX9-NEXT: s_lshl_b32 s24, s24, 8 -; GFX9-NEXT: s_or_b32 s23, s23, s24 -; GFX9-NEXT: s_and_b32 s22, s22, 0xffff -; GFX9-NEXT: s_lshl_b32 s23, s23, 16 -; GFX9-NEXT: s_or_b32 s22, s22, s23 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 @@ -10303,16 +10240,35 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: s_and_b32 s22, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s24, s71, 8 +; GFX9-NEXT: s_or_b32 s22, s22, s24 +; GFX9-NEXT: s_and_b32 s24, s70, 0xff +; GFX9-NEXT: s_lshl_b32 s25, s88, 8 +; GFX9-NEXT: s_or_b32 s24, s24, s25 +; GFX9-NEXT: s_and_b32 s22, s22, 0xffff +; GFX9-NEXT: s_lshl_b32 s24, s24, 16 +; GFX9-NEXT: s_or_b32 s22, s22, s24 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-NEXT: s_and_b32 s22, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s23, s27, 8 +; GFX9-NEXT: s_or_b32 s22, s22, s23 +; GFX9-NEXT: s_and_b32 s23, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s24, s69, 8 +; GFX9-NEXT: s_or_b32 s23, s23, s24 +; GFX9-NEXT: s_and_b32 s22, s22, 0xffff +; GFX9-NEXT: s_lshl_b32 s23, s23, 16 +; GFX9-NEXT: s_or_b32 s22, s22, s23 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 ; GFX9-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-NEXT: v_readlane_b32 s22, v22, 46 ; GFX9-NEXT: s_and_b32 s20, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s22, s22, 8 +; GFX9-NEXT: s_lshl_b32 s22, s68, 8 +; GFX9-NEXT: v_readlane_b32 s24, v22, 0 ; GFX9-NEXT: s_or_b32 s20, s20, s22 -; GFX9-NEXT: v_readlane_b32 s22, v22, 45 -; GFX9-NEXT: s_and_b32 s22, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s23, s78, 8 +; GFX9-NEXT: s_and_b32 s22, s67, 0xff +; GFX9-NEXT: s_lshl_b32 s23, s24, 8 ; GFX9-NEXT: s_or_b32 s22, s22, s23 ; GFX9-NEXT: s_and_b32 s20, s20, 0xffff ; GFX9-NEXT: s_lshl_b32 s22, s22, 16 @@ -10320,26 +10276,22 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: v_mov_b32_e32 v1, s20 ; GFX9-NEXT: s_and_b32 s20, s21, 0xff -; GFX9-NEXT: v_readlane_b32 s21, v22, 44 -; GFX9-NEXT: s_lshl_b32 s21, s21, 8 +; GFX9-NEXT: s_lshl_b32 s21, s66, 8 ; GFX9-NEXT: s_or_b32 s20, s20, s21 -; GFX9-NEXT: v_readlane_b32 s21, v22, 43 -; GFX9-NEXT: v_readlane_b32 s22, v22, 42 -; GFX9-NEXT: s_and_b32 s21, s21, 0xff -; GFX9-NEXT: s_lshl_b32 s22, s22, 8 +; GFX9-NEXT: s_and_b32 s21, s65, 0xff +; GFX9-NEXT: s_lshl_b32 s22, s64, 8 ; GFX9-NEXT: s_or_b32 s21, s21, s22 ; GFX9-NEXT: s_and_b32 s20, s20, 0xffff ; GFX9-NEXT: s_lshl_b32 s21, s21, 16 ; GFX9-NEXT: s_or_b32 s20, s20, s21 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_mov_b32_e32 v1, s20 -; GFX9-NEXT: v_readlane_b32 s20, v22, 41 ; GFX9-NEXT: s_and_b32 s18, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s20, s20, 8 +; GFX9-NEXT: s_lshl_b32 s20, s76, 8 +; GFX9-NEXT: v_readlane_b32 s22, v22, 2 ; GFX9-NEXT: s_or_b32 s18, s18, s20 -; GFX9-NEXT: v_readlane_b32 s20, v22, 40 -; GFX9-NEXT: s_and_b32 s20, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s21, s76, 8 +; GFX9-NEXT: s_and_b32 s20, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s22, 8 ; GFX9-NEXT: s_or_b32 s20, s20, s21 ; GFX9-NEXT: s_and_b32 s18, s18, 0xffff ; GFX9-NEXT: s_lshl_b32 s20, s20, 16 @@ -10347,11 +10299,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: s_and_b32 s18, s19, 0xff -; GFX9-NEXT: v_readlane_b32 s19, v22, 39 +; GFX9-NEXT: v_readlane_b32 s19, v22, 44 ; GFX9-NEXT: s_lshl_b32 s19, s19, 8 ; GFX9-NEXT: s_or_b32 s18, s18, s19 -; GFX9-NEXT: v_readlane_b32 s19, v22, 38 -; GFX9-NEXT: v_readlane_b32 s20, v22, 37 +; GFX9-NEXT: v_readlane_b32 s19, v22, 43 +; GFX9-NEXT: v_readlane_b32 s20, v22, 42 ; GFX9-NEXT: s_and_b32 s19, s19, 0xff ; GFX9-NEXT: s_lshl_b32 s20, s20, 8 ; GFX9-NEXT: s_or_b32 s19, s19, s20 @@ -10360,13 +10312,14 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_readlane_b32 s18, v22, 36 +; GFX9-NEXT: v_readlane_b32 s18, v22, 41 ; GFX9-NEXT: s_and_b32 s16, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s18 -; GFX9-NEXT: v_readlane_b32 s18, v22, 35 +; GFX9-NEXT: v_readlane_b32 s18, v22, 40 +; GFX9-NEXT: v_readlane_b32 s20, v22, 4 ; GFX9-NEXT: s_and_b32 s18, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s19, s74, 8 +; GFX9-NEXT: s_lshl_b32 s19, s20, 8 ; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s18, s18, 16 @@ -10374,11 +10327,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s17, 0xff -; GFX9-NEXT: v_readlane_b32 s17, v22, 34 +; GFX9-NEXT: v_readlane_b32 s17, v22, 39 ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v22, 33 -; GFX9-NEXT: v_readlane_b32 s18, v22, 32 +; GFX9-NEXT: v_readlane_b32 s17, v22, 38 +; GFX9-NEXT: v_readlane_b32 s18, v22, 37 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 @@ -10387,13 +10340,14 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_readlane_b32 s16, v22, 31 +; GFX9-NEXT: v_readlane_b32 s16, v22, 36 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff ; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s16 -; GFX9-NEXT: v_readlane_b32 s16, v22, 30 +; GFX9-NEXT: v_readlane_b32 s16, v22, 35 +; GFX9-NEXT: v_readlane_b32 s18, v22, 6 ; GFX9-NEXT: s_and_b32 s16, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s72, 8 +; GFX9-NEXT: s_lshl_b32 s17, s18, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: s_and_b32 s14, s14, 0xffff ; GFX9-NEXT: s_lshl_b32 s16, s16, 16 @@ -10401,11 +10355,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-NEXT: s_and_b32 s14, s15, 0xff -; GFX9-NEXT: v_readlane_b32 s15, v22, 29 +; GFX9-NEXT: v_readlane_b32 s15, v22, 34 ; GFX9-NEXT: s_lshl_b32 s15, s15, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 -; GFX9-NEXT: v_readlane_b32 s15, v22, 28 -; GFX9-NEXT: v_readlane_b32 s16, v22, 27 +; GFX9-NEXT: v_readlane_b32 s15, v22, 33 +; GFX9-NEXT: v_readlane_b32 s16, v22, 32 ; GFX9-NEXT: s_and_b32 s15, s15, 0xff ; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_or_b32 s15, s15, s16 @@ -10414,13 +10368,14 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_readlane_b32 s14, v22, 26 +; GFX9-NEXT: v_readlane_b32 s14, v22, 31 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff ; GFX9-NEXT: s_lshl_b32 s14, s14, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s14 -; GFX9-NEXT: v_readlane_b32 s14, v22, 25 +; GFX9-NEXT: v_readlane_b32 s14, v22, 30 +; GFX9-NEXT: v_readlane_b32 s16, v22, 8 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff -; GFX9-NEXT: s_lshl_b32 s15, s62, 8 +; GFX9-NEXT: s_lshl_b32 s15, s16, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: s_and_b32 s12, s12, 0xffff ; GFX9-NEXT: s_lshl_b32 s14, s14, 16 @@ -10428,11 +10383,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-NEXT: s_and_b32 s12, s13, 0xff -; GFX9-NEXT: v_readlane_b32 s13, v22, 24 +; GFX9-NEXT: v_readlane_b32 s13, v22, 29 ; GFX9-NEXT: s_lshl_b32 s13, s13, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 -; GFX9-NEXT: v_readlane_b32 s13, v22, 23 -; GFX9-NEXT: v_readlane_b32 s14, v22, 22 +; GFX9-NEXT: v_readlane_b32 s13, v22, 28 +; GFX9-NEXT: v_readlane_b32 s14, v22, 27 ; GFX9-NEXT: s_and_b32 s13, s13, 0xff ; GFX9-NEXT: s_lshl_b32 s14, s14, 8 ; GFX9-NEXT: s_or_b32 s13, s13, s14 @@ -10441,13 +10396,14 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_readlane_b32 s12, v22, 21 +; GFX9-NEXT: v_readlane_b32 s12, v22, 26 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s12 -; GFX9-NEXT: v_readlane_b32 s12, v22, 20 +; GFX9-NEXT: v_readlane_b32 s12, v22, 25 +; GFX9-NEXT: v_readlane_b32 s14, v22, 10 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff -; GFX9-NEXT: s_lshl_b32 s13, s60, 8 +; GFX9-NEXT: s_lshl_b32 s13, s14, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: s_and_b32 s10, s10, 0xffff ; GFX9-NEXT: s_lshl_b32 s12, s12, 16 @@ -10455,11 +10411,11 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff -; GFX9-NEXT: v_readlane_b32 s11, v22, 19 +; GFX9-NEXT: v_readlane_b32 s11, v22, 24 ; GFX9-NEXT: s_lshl_b32 s11, s11, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 -; GFX9-NEXT: v_readlane_b32 s11, v22, 18 -; GFX9-NEXT: v_readlane_b32 s12, v22, 17 +; GFX9-NEXT: v_readlane_b32 s11, v22, 23 +; GFX9-NEXT: v_readlane_b32 s12, v22, 22 ; GFX9-NEXT: s_and_b32 s11, s11, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s11, s11, s12 @@ -10468,13 +10424,14 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_readlane_b32 s10, v22, 16 +; GFX9-NEXT: v_readlane_b32 s10, v22, 21 ; GFX9-NEXT: s_and_b32 s8, s8, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: v_readlane_b32 s10, v22, 15 +; GFX9-NEXT: v_readlane_b32 s10, v22, 20 +; GFX9-NEXT: v_readlane_b32 s12, v22, 12 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s58, 8 +; GFX9-NEXT: s_lshl_b32 s11, s12, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 @@ -10482,26 +10439,24 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: s_and_b32 s8, s9, 0xff -; GFX9-NEXT: v_readlane_b32 s9, v22, 14 +; GFX9-NEXT: v_readlane_b32 s9, v22, 19 ; GFX9-NEXT: s_lshl_b32 s9, s9, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: v_readlane_b32 s9, v22, 13 -; GFX9-NEXT: v_readlane_b32 s10, v22, 12 +; GFX9-NEXT: v_readlane_b32 s9, v22, 18 ; GFX9-NEXT: s_and_b32 s9, s9, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_lshl_b32 s10, s54, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_lshl_b32 s9, s9, 16 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_readlane_b32 s8, v22, 11 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_lshl_b32 s8, s55, 8 +; GFX9-NEXT: v_readlane_b32 s10, v22, 14 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_readlane_b32 s8, v22, 10 -; GFX9-NEXT: s_and_b32 s8, s8, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s28, 8 +; GFX9-NEXT: s_and_b32 s8, s53, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 @@ -10509,26 +10464,21 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s6, s7, 0xff -; GFX9-NEXT: v_readlane_b32 s7, v22, 9 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_lshl_b32 s7, s36, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_readlane_b32 s7, v22, 8 -; GFX9-NEXT: v_readlane_b32 s8, v22, 7 -; GFX9-NEXT: s_and_b32 s7, s7, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_and_b32 s7, s37, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s35, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_readlane_b32 s6, v22, 6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s6, s31, 8 +; GFX9-NEXT: v_readlane_b32 s8, v22, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: v_readlane_b32 s6, v22, 5 -; GFX9-NEXT: v_readlane_b32 s8, v22, 0 -; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_and_b32 s6, s95, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s8, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -10537,20 +10487,25 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_and_b32 s4, s5, 0xff -; GFX9-NEXT: v_readlane_b32 s5, v22, 4 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_lshl_b32 s5, s93, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_readlane_b32 s5, v22, 3 -; GFX9-NEXT: v_readlane_b32 s6, v22, 2 -; GFX9-NEXT: s_and_b32 s5, s5, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_and_b32 s5, s91, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s89, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_readlane_b32 s9, v22, 1 +; GFX9-NEXT: v_readlane_b32 s25, v22, 1 +; GFX9-NEXT: v_readlane_b32 s23, v22, 3 +; GFX9-NEXT: v_readlane_b32 s21, v22, 5 +; GFX9-NEXT: v_readlane_b32 s19, v22, 7 +; GFX9-NEXT: v_readlane_b32 s17, v22, 9 +; GFX9-NEXT: v_readlane_b32 s15, v22, 11 +; GFX9-NEXT: v_readlane_b32 s13, v22, 13 +; GFX9-NEXT: v_readlane_b32 s11, v22, 15 +; GFX9-NEXT: v_readlane_b32 s9, v22, 17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: v_readlane_b32 s99, v21, 35 ; GFX9-NEXT: v_readlane_b32 s98, v21, 34 @@ -10595,153 +10550,147 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB13_4: -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: v_writelane_b32 v22, s82, 0 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr80 -; GFX9-NEXT: ; implicit-def: $sgpr71 -; GFX9-NEXT: ; implicit-def: $sgpr70 -; GFX9-NEXT: ; implicit-def: $sgpr69 -; GFX9-NEXT: ; implicit-def: $sgpr68 -; GFX9-NEXT: ; implicit-def: $sgpr67 -; GFX9-NEXT: ; implicit-def: $sgpr66 -; GFX9-NEXT: ; implicit-def: $sgpr65 -; GFX9-NEXT: ; implicit-def: $sgpr64 -; GFX9-NEXT: ; implicit-def: $sgpr55 -; GFX9-NEXT: ; implicit-def: $sgpr54 -; GFX9-NEXT: ; implicit-def: $sgpr53 -; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: v_writelane_b32 v22, s76, 0 +; GFX9-NEXT: v_writelane_b32 v22, s77, 1 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 2 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 3 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 4 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 5 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 6 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 7 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 8 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 9 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 10 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 11 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 12 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 13 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 14 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 15 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 16 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 17 ; GFX9-NEXT: ; implicit-def: $sgpr51 ; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 ; GFX9-NEXT: ; implicit-def: $sgpr49 ; GFX9-NEXT: ; implicit-def: $sgpr48 ; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr99 ; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr97 ; GFX9-NEXT: ; implicit-def: $sgpr96 ; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr86 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr84 -; GFX9-NEXT: ; implicit-def: $sgpr81 -; GFX9-NEXT: ; implicit-def: $sgpr83 -; GFX9-NEXT: ; implicit-def: $sgpr36 -; GFX9-NEXT: ; implicit-def: $sgpr34 -; GFX9-NEXT: ; implicit-def: $sgpr30 -; GFX9-NEXT: ; implicit-def: $sgpr94 ; GFX9-NEXT: ; implicit-def: $sgpr92 -; GFX9-NEXT: ; implicit-def: $sgpr90 -; GFX9-NEXT: ; implicit-def: $sgpr88 -; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr76 -; GFX9-NEXT: ; implicit-def: $sgpr74 -; GFX9-NEXT: ; implicit-def: $sgpr72 -; GFX9-NEXT: ; implicit-def: $sgpr62 -; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 ; GFX9-NEXT: ; implicit-def: $sgpr58 -; GFX9-NEXT: ; implicit-def: $sgpr28 -; GFX9-NEXT: v_writelane_b32 v22, s83, 1 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr88 ; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; kill: killed $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr89 ; GFX9-NEXT: s_branch .LBB13_2 ; ; GFX11-LABEL: bitcast_v32i32_to_v128i8_scalar: @@ -10776,21 +10725,21 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 ; GFX11-NEXT: v_writelane_b32 v34, s37, 5 ; GFX11-NEXT: v_writelane_b32 v35, s101, 5 -; GFX11-NEXT: v_readfirstlane_b32 s40, v16 -; GFX11-NEXT: v_readfirstlane_b32 s41, v17 -; GFX11-NEXT: v_readfirstlane_b32 s28, v18 +; GFX11-NEXT: v_readfirstlane_b32 s56, v16 +; GFX11-NEXT: v_readfirstlane_b32 s57, v17 +; GFX11-NEXT: v_readfirstlane_b32 s46, v18 ; GFX11-NEXT: v_writelane_b32 v34, s38, 6 ; GFX11-NEXT: v_writelane_b32 v35, s102, 6 -; GFX11-NEXT: v_readfirstlane_b32 s29, v19 -; GFX11-NEXT: v_readfirstlane_b32 s26, v20 -; GFX11-NEXT: v_readfirstlane_b32 s27, v21 +; GFX11-NEXT: v_readfirstlane_b32 s47, v19 +; GFX11-NEXT: v_readfirstlane_b32 s42, v20 +; GFX11-NEXT: v_readfirstlane_b32 s43, v21 ; GFX11-NEXT: v_writelane_b32 v34, s39, 7 ; GFX11-NEXT: v_writelane_b32 v35, s103, 7 -; GFX11-NEXT: v_readfirstlane_b32 s24, v22 -; GFX11-NEXT: v_readfirstlane_b32 s25, v23 -; GFX11-NEXT: v_readfirstlane_b32 s22, v24 +; GFX11-NEXT: v_readfirstlane_b32 s28, v22 +; GFX11-NEXT: v_readfirstlane_b32 s29, v23 +; GFX11-NEXT: v_readfirstlane_b32 s24, v24 ; GFX11-NEXT: v_writelane_b32 v34, s48, 8 -; GFX11-NEXT: v_readfirstlane_b32 s23, v25 +; GFX11-NEXT: v_readfirstlane_b32 s25, v25 ; GFX11-NEXT: v_readfirstlane_b32 s20, v26 ; GFX11-NEXT: v_readfirstlane_b32 s21, v27 ; GFX11-NEXT: v_readfirstlane_b32 s18, v28 @@ -10818,9 +10767,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readfirstlane_b32 s3, v12 ; GFX11-NEXT: v_readfirstlane_b32 s0, v13 ; GFX11-NEXT: v_readfirstlane_b32 s1, v14 -; GFX11-NEXT: s_mov_b32 s101, 0 +; GFX11-NEXT: s_mov_b32 s60, 0 ; GFX11-NEXT: v_writelane_b32 v34, s54, 14 -; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s22, vcc_lo, exec_lo ; GFX11-NEXT: v_writelane_b32 v35, s104, 8 ; GFX11-NEXT: ; implicit-def: $vgpr37 : SGPR spill to VGPR lane ; GFX11-NEXT: ; implicit-def: $vgpr36 : SGPR spill to VGPR lane @@ -10841,789 +10790,610 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_writelane_b32 v34, s85, 29 ; GFX11-NEXT: v_writelane_b32 v34, s86, 30 ; GFX11-NEXT: v_writelane_b32 v34, s87, 31 -; GFX11-NEXT: s_cbranch_scc0 .LBB13_2 +; GFX11-NEXT: s_cbranch_scc0 .LBB13_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s43, s19, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v37, s43, 16 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s104, s1, 24 -; GFX11-NEXT: s_lshr_b32 s102, s1, 16 -; GFX11-NEXT: s_lshr_b32 s103, s1, 8 -; GFX11-NEXT: v_writelane_b32 v37, s43, 17 -; GFX11-NEXT: s_lshr_b32 s43, s18, 8 -; GFX11-NEXT: s_lshr_b32 s57, s0, 16 -; GFX11-NEXT: s_lshr_b32 s47, s0, 8 -; GFX11-NEXT: s_lshr_b32 s46, s3, 24 -; GFX11-NEXT: v_writelane_b32 v37, s43, 18 -; GFX11-NEXT: s_lshr_b32 s43, s21, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s3, 16 -; GFX11-NEXT: s_lshr_b32 s34, s3, 8 -; GFX11-NEXT: s_lshr_b32 s69, s2, 16 -; GFX11-NEXT: v_writelane_b32 v37, s43, 19 -; GFX11-NEXT: s_lshr_b32 s43, s21, 16 -; GFX11-NEXT: s_lshr_b32 s56, s2, 8 -; GFX11-NEXT: s_lshr_b32 s35, s5, 24 -; GFX11-NEXT: s_lshr_b32 s36, s5, 16 -; GFX11-NEXT: v_writelane_b32 v37, s43, 20 -; GFX11-NEXT: s_lshr_b32 s43, s21, 8 -; GFX11-NEXT: s_lshr_b32 s37, s5, 8 -; GFX11-NEXT: s_lshr_b32 s38, s4, 16 -; GFX11-NEXT: s_lshr_b32 s39, s4, 8 -; GFX11-NEXT: v_writelane_b32 v37, s43, 21 -; GFX11-NEXT: s_lshr_b32 s43, s20, 16 -; GFX11-NEXT: s_lshr_b32 s48, s7, 24 -; GFX11-NEXT: s_lshr_b32 s49, s7, 16 -; GFX11-NEXT: s_lshr_b32 s50, s7, 8 -; GFX11-NEXT: v_writelane_b32 v37, s43, 22 -; GFX11-NEXT: s_lshr_b32 s43, s20, 8 -; GFX11-NEXT: s_lshr_b32 s51, s6, 16 -; GFX11-NEXT: s_lshr_b32 s52, s6, 8 -; GFX11-NEXT: s_lshr_b32 s53, s9, 24 -; GFX11-NEXT: v_writelane_b32 v37, s43, 23 -; GFX11-NEXT: s_lshr_b32 s43, s23, 24 -; GFX11-NEXT: s_lshr_b32 s54, s9, 16 -; GFX11-NEXT: s_lshr_b32 s55, s9, 8 -; GFX11-NEXT: s_lshr_b32 s64, s8, 16 -; GFX11-NEXT: v_writelane_b32 v37, s43, 24 -; GFX11-NEXT: s_lshr_b32 s43, s23, 16 -; GFX11-NEXT: s_lshr_b32 s65, s8, 8 -; GFX11-NEXT: s_lshr_b32 s66, s11, 24 -; GFX11-NEXT: s_lshr_b32 s67, s11, 16 -; GFX11-NEXT: v_writelane_b32 v37, s43, 25 -; GFX11-NEXT: s_lshr_b32 s43, s23, 8 -; GFX11-NEXT: s_lshr_b32 s68, s11, 8 -; GFX11-NEXT: s_lshr_b32 s59, s10, 16 -; GFX11-NEXT: s_lshr_b32 s58, s10, 8 -; GFX11-NEXT: v_writelane_b32 v37, s43, 26 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s70, s13, 24 -; GFX11-NEXT: s_lshr_b32 s71, s13, 16 -; GFX11-NEXT: s_lshr_b32 s60, s13, 8 -; GFX11-NEXT: v_writelane_b32 v37, s43, 27 -; GFX11-NEXT: s_lshr_b32 s43, s22, 8 -; GFX11-NEXT: s_lshr_b32 s80, s12, 16 -; GFX11-NEXT: s_lshr_b32 s61, s12, 8 -; GFX11-NEXT: s_lshr_b32 s81, s15, 24 -; GFX11-NEXT: v_writelane_b32 v37, s43, 28 -; GFX11-NEXT: s_lshr_b32 s43, s25, 24 -; GFX11-NEXT: s_lshr_b32 s82, s15, 16 -; GFX11-NEXT: s_lshr_b32 s83, s15, 8 -; GFX11-NEXT: s_lshr_b32 s84, s14, 16 -; GFX11-NEXT: v_writelane_b32 v37, s43, 29 -; GFX11-NEXT: s_lshr_b32 s43, s25, 16 -; GFX11-NEXT: s_lshr_b32 s85, s14, 8 -; GFX11-NEXT: s_lshr_b32 s86, s17, 24 -; GFX11-NEXT: s_lshr_b32 s72, s17, 16 -; GFX11-NEXT: v_writelane_b32 v37, s43, 30 -; GFX11-NEXT: s_lshr_b32 s43, s25, 8 -; GFX11-NEXT: s_lshr_b32 s87, s17, 8 -; GFX11-NEXT: s_lshr_b32 s73, s16, 16 -; GFX11-NEXT: s_lshr_b32 s96, s16, 8 -; GFX11-NEXT: v_writelane_b32 v37, s43, 31 -; GFX11-NEXT: s_lshr_b32 s43, s24, 16 -; GFX11-NEXT: s_lshr_b32 s97, s19, 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 0 -; GFX11-NEXT: s_lshr_b32 s43, s24, 8 -; GFX11-NEXT: v_writelane_b32 v37, s62, 14 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s74, s28, 16 -; GFX11-NEXT: v_writelane_b32 v36, s43, 1 -; GFX11-NEXT: s_lshr_b32 s43, s27, 24 -; GFX11-NEXT: v_writelane_b32 v37, s63, 15 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[2:3], 24 -; GFX11-NEXT: s_lshr_b32 s98, s41, 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 2 -; GFX11-NEXT: s_lshr_b32 s43, s27, 16 -; GFX11-NEXT: v_writelane_b32 v37, s62, 12 -; GFX11-NEXT: s_lshr_b32 s99, s41, 16 -; GFX11-NEXT: s_lshr_b32 s100, s41, 8 -; GFX11-NEXT: v_writelane_b32 v36, s43, 3 -; GFX11-NEXT: s_lshr_b32 s43, s27, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 13 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; GFX11-NEXT: s_lshr_b32 s44, s40, 16 -; GFX11-NEXT: v_writelane_b32 v36, s43, 4 -; GFX11-NEXT: s_lshr_b32 s43, s26, 16 -; GFX11-NEXT: v_writelane_b32 v37, s62, 10 -; GFX11-NEXT: s_lshr_b32 s45, s40, 8 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 5 -; GFX11-NEXT: s_lshr_b32 s43, s26, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 11 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 6 -; GFX11-NEXT: s_lshr_b32 s43, s29, 24 -; GFX11-NEXT: v_writelane_b32 v37, s62, 8 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[22:23], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 7 -; GFX11-NEXT: s_lshr_b32 s43, s29, 16 -; GFX11-NEXT: v_writelane_b32 v37, s63, 9 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[26:27], 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 8 -; GFX11-NEXT: s_lshr_b32 s43, s29, 8 -; GFX11-NEXT: v_writelane_b32 v37, s62, 6 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[28:29], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 9 -; GFX11-NEXT: s_lshr_b32 s43, s28, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 7 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-NEXT: s_lshr_b32 s22, s1, 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[0:1], 24 +; GFX11-NEXT: v_writelane_b32 v37, s22, 14 +; GFX11-NEXT: s_lshr_b32 s22, s4, 16 +; GFX11-NEXT: s_lshr_b32 s74, s1, 16 +; GFX11-NEXT: s_lshr_b32 s76, s1, 8 +; GFX11-NEXT: s_lshr_b32 s78, s0, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 15 +; GFX11-NEXT: s_lshr_b32 s22, s4, 8 +; GFX11-NEXT: s_lshr_b32 s88, s0, 8 +; GFX11-NEXT: s_lshr_b32 s90, s3, 24 +; GFX11-NEXT: s_lshr_b32 s92, s3, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 16 +; GFX11-NEXT: s_lshr_b32 s22, s7, 24 +; GFX11-NEXT: s_lshr_b32 s94, s3, 8 +; GFX11-NEXT: s_lshr_b32 vcc_lo, s2, 16 +; GFX11-NEXT: s_lshr_b32 s61, s5, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 17 +; GFX11-NEXT: s_lshr_b32 s22, s7, 16 +; GFX11-NEXT: s_lshr_b64 s[98:99], s[56:57], 24 +; GFX11-NEXT: s_lshr_b32 s31, s2, 8 +; GFX11-NEXT: s_lshr_b32 s97, s5, 24 +; GFX11-NEXT: v_writelane_b32 v37, s22, 18 +; GFX11-NEXT: s_lshr_b32 s22, s7, 8 +; GFX11-NEXT: s_lshr_b32 s63, s5, 16 +; GFX11-NEXT: s_lshr_b32 s30, s13, 16 +; GFX11-NEXT: s_lshr_b32 s87, s13, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 19 +; GFX11-NEXT: s_lshr_b32 s22, s6, 16 +; GFX11-NEXT: s_lshr_b32 s62, s12, 16 +; GFX11-NEXT: s_lshr_b32 s96, s12, 8 +; GFX11-NEXT: s_lshr_b32 s100, s15, 24 +; GFX11-NEXT: v_writelane_b32 v37, s22, 20 +; GFX11-NEXT: s_lshr_b32 s22, s6, 8 +; GFX11-NEXT: s_lshr_b32 s101, s15, 16 +; GFX11-NEXT: s_lshr_b32 s102, s15, 8 +; GFX11-NEXT: s_lshr_b32 s103, s14, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 21 +; GFX11-NEXT: s_lshr_b32 s22, s9, 24 +; GFX11-NEXT: s_lshr_b32 s104, s14, 8 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s17, 24 +; GFX11-NEXT: s_lshr_b32 s34, s17, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 22 +; GFX11-NEXT: s_lshr_b32 s22, s9, 16 +; GFX11-NEXT: s_lshr_b32 s35, s17, 8 +; GFX11-NEXT: s_lshr_b32 s36, s16, 16 +; GFX11-NEXT: s_lshr_b32 s37, s16, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 23 +; GFX11-NEXT: s_lshr_b32 s22, s9, 8 +; GFX11-NEXT: s_lshr_b32 s38, s19, 24 +; GFX11-NEXT: s_lshr_b32 s39, s19, 16 +; GFX11-NEXT: s_lshr_b32 s48, s19, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 24 +; GFX11-NEXT: s_lshr_b32 s22, s8, 16 +; GFX11-NEXT: s_lshr_b32 s49, s18, 16 +; GFX11-NEXT: s_lshr_b32 s50, s18, 8 +; GFX11-NEXT: s_lshr_b32 s51, s21, 24 +; GFX11-NEXT: v_writelane_b32 v37, s22, 25 +; GFX11-NEXT: s_lshr_b32 s22, s8, 8 +; GFX11-NEXT: s_lshr_b32 s52, s21, 16 +; GFX11-NEXT: s_lshr_b32 s53, s21, 8 +; GFX11-NEXT: s_lshr_b32 s54, s20, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 26 +; GFX11-NEXT: s_lshr_b32 s22, s11, 24 +; GFX11-NEXT: s_lshr_b32 s55, s20, 8 +; GFX11-NEXT: s_lshr_b32 s64, s25, 24 +; GFX11-NEXT: s_lshr_b32 s65, s25, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 27 +; GFX11-NEXT: s_lshr_b32 s22, s11, 16 +; GFX11-NEXT: s_lshr_b32 s23, s24, 16 +; GFX11-NEXT: s_lshr_b32 s66, s24, 8 +; GFX11-NEXT: s_lshr_b32 s67, s29, 24 +; GFX11-NEXT: v_writelane_b32 v37, s22, 28 +; GFX11-NEXT: s_lshr_b32 s22, s11, 8 +; GFX11-NEXT: s_lshr_b32 s68, s29, 16 +; GFX11-NEXT: s_lshr_b32 s26, s29, 8 +; GFX11-NEXT: s_lshr_b32 s27, s28, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 29 +; GFX11-NEXT: s_lshr_b32 s22, s10, 16 +; GFX11-NEXT: s_lshr_b32 s40, s28, 8 +; GFX11-NEXT: s_lshr_b32 s41, s43, 24 +; GFX11-NEXT: s_lshr_b32 s44, s43, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 30 +; GFX11-NEXT: s_lshr_b32 s22, s10, 8 +; GFX11-NEXT: s_lshr_b32 s45, s43, 8 +; GFX11-NEXT: s_lshr_b32 s69, s42, 16 +; GFX11-NEXT: s_lshr_b32 s70, s42, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 31 +; GFX11-NEXT: s_lshr_b32 s22, s13, 24 +; GFX11-NEXT: s_lshr_b32 s71, s47, 24 +; GFX11-NEXT: v_writelane_b32 v36, s22, 0 +; GFX11-NEXT: s_lshr_b32 s22, s25, 8 +; GFX11-NEXT: v_writelane_b32 v37, s72, 12 +; GFX11-NEXT: s_lshr_b32 s80, s47, 16 +; GFX11-NEXT: s_lshr_b32 s81, s47, 8 +; GFX11-NEXT: s_lshr_b32 s82, s46, 16 +; GFX11-NEXT: s_lshr_b32 s83, s46, 8 +; GFX11-NEXT: v_writelane_b32 v37, s73, 13 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[2:3], 24 +; GFX11-NEXT: s_lshr_b32 s84, s57, 24 +; GFX11-NEXT: s_lshr_b32 s58, s57, 16 +; GFX11-NEXT: s_lshr_b32 s59, s57, 8 +; GFX11-NEXT: v_writelane_b32 v37, s72, 10 +; GFX11-NEXT: s_lshr_b32 s85, s56, 16 +; GFX11-NEXT: s_lshr_b32 s86, s56, 8 +; GFX11-NEXT: s_mov_b32 s99, s61 +; GFX11-NEXT: v_writelane_b32 v37, s73, 11 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[4:5], 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v37, s62, 4 -; GFX11-NEXT: v_writelane_b32 v37, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX11-NEXT: v_writelane_b32 v37, s62, 2 -; GFX11-NEXT: v_writelane_b32 v37, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; GFX11-NEXT: v_writelane_b32 v37, s72, 8 +; GFX11-NEXT: v_writelane_b32 v37, s73, 9 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[6:7], 24 +; GFX11-NEXT: v_writelane_b32 v37, s72, 6 +; GFX11-NEXT: v_writelane_b32 v37, s73, 7 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[8:9], 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v37, s72, 4 +; GFX11-NEXT: v_writelane_b32 v37, s73, 5 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; GFX11-NEXT: v_writelane_b32 v37, s72, 2 +; GFX11-NEXT: v_writelane_b32 v37, s73, 3 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[12:13], 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v37, s62, 0 -; GFX11-NEXT: v_writelane_b32 v37, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 -; GFX11-NEXT: s_branch .LBB13_3 -; GFX11-NEXT: .LBB13_2: -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 -; GFX11-NEXT: ; implicit-def: $vcc_hi -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: s_mov_b32 s101, -1 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr45 -; GFX11-NEXT: ; implicit-def: $sgpr44 -; GFX11-NEXT: ; implicit-def: $sgpr30 -; GFX11-NEXT: ; implicit-def: $sgpr100 -; GFX11-NEXT: ; implicit-def: $sgpr99 -; GFX11-NEXT: ; implicit-def: $sgpr98 -; GFX11-NEXT: ; implicit-def: $sgpr43 -; GFX11-NEXT: ; implicit-def: $sgpr74 -; GFX11-NEXT: ; implicit-def: $sgpr94 -; GFX11-NEXT: ; implicit-def: $sgpr92 -; GFX11-NEXT: ; implicit-def: $sgpr90 -; GFX11-NEXT: ; implicit-def: $sgpr78 -; GFX11-NEXT: ; implicit-def: $sgpr62 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr97 -; GFX11-NEXT: ; implicit-def: $sgpr96 -; GFX11-NEXT: ; implicit-def: $sgpr73 -; GFX11-NEXT: ; implicit-def: $sgpr87 -; GFX11-NEXT: ; implicit-def: $sgpr72 -; GFX11-NEXT: ; implicit-def: $sgpr86 -; GFX11-NEXT: ; implicit-def: $sgpr85 -; GFX11-NEXT: ; implicit-def: $sgpr84 -; GFX11-NEXT: ; implicit-def: $sgpr83 -; GFX11-NEXT: ; implicit-def: $sgpr82 -; GFX11-NEXT: ; implicit-def: $sgpr81 -; GFX11-NEXT: ; implicit-def: $sgpr61 -; GFX11-NEXT: ; implicit-def: $sgpr80 -; GFX11-NEXT: ; implicit-def: $sgpr60 -; GFX11-NEXT: ; implicit-def: $sgpr71 -; GFX11-NEXT: ; implicit-def: $sgpr70 -; GFX11-NEXT: ; implicit-def: $sgpr58 -; GFX11-NEXT: ; implicit-def: $sgpr59 -; GFX11-NEXT: ; implicit-def: $sgpr68 -; GFX11-NEXT: ; implicit-def: $sgpr67 -; GFX11-NEXT: ; implicit-def: $sgpr66 -; GFX11-NEXT: ; implicit-def: $sgpr65 -; GFX11-NEXT: ; implicit-def: $sgpr64 -; GFX11-NEXT: ; implicit-def: $sgpr55 -; GFX11-NEXT: ; implicit-def: $sgpr54 -; GFX11-NEXT: ; implicit-def: $sgpr53 -; GFX11-NEXT: ; implicit-def: $sgpr52 -; GFX11-NEXT: ; implicit-def: $sgpr51 -; GFX11-NEXT: ; implicit-def: $sgpr50 -; GFX11-NEXT: ; implicit-def: $sgpr49 -; GFX11-NEXT: ; implicit-def: $sgpr48 -; GFX11-NEXT: ; implicit-def: $sgpr39 -; GFX11-NEXT: ; implicit-def: $sgpr38 -; GFX11-NEXT: ; implicit-def: $sgpr37 -; GFX11-NEXT: ; implicit-def: $sgpr36 -; GFX11-NEXT: ; implicit-def: $sgpr35 -; GFX11-NEXT: ; implicit-def: $sgpr56 -; GFX11-NEXT: ; implicit-def: $sgpr69 -; GFX11-NEXT: ; implicit-def: $sgpr34 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr47 -; GFX11-NEXT: ; implicit-def: $sgpr57 -; GFX11-NEXT: ; implicit-def: $sgpr103 -; GFX11-NEXT: ; implicit-def: $sgpr102 -; GFX11-NEXT: ; implicit-def: $sgpr104 -; GFX11-NEXT: ; implicit-def: $sgpr88 -; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 14 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 15 -; GFX11-NEXT: .LBB13_3: ; %Flow -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101 -; GFX11-NEXT: s_mov_b32 s101, s104 -; GFX11-NEXT: s_mov_b32 s104, s57 -; GFX11-NEXT: s_mov_b32 s57, s69 -; GFX11-NEXT: s_mov_b32 s69, s42 -; GFX11-NEXT: s_cbranch_vccnz .LBB13_5 -; GFX11-NEXT: ; %bb.4: ; %cmp.true -; GFX11-NEXT: s_add_i32 s19, s19, 3 -; GFX11-NEXT: s_add_i32 s18, s18, 3 -; GFX11-NEXT: s_lshr_b32 s42, s19, 8 -; GFX11-NEXT: s_add_i32 s21, s21, 3 -; GFX11-NEXT: v_writelane_b32 v37, s42, 16 -; GFX11-NEXT: s_lshr_b32 s42, s18, 16 -; GFX11-NEXT: s_add_i32 s20, s20, 3 -; GFX11-NEXT: s_add_i32 s23, s23, 3 -; GFX11-NEXT: s_add_i32 s22, s22, 3 -; GFX11-NEXT: v_writelane_b32 v37, s42, 17 -; GFX11-NEXT: s_lshr_b32 s42, s18, 8 -; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: v_writelane_b32 v37, s72, 0 +; GFX11-NEXT: v_writelane_b32 v37, s73, 1 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX11-NEXT: s_mov_b32 s73, s74 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; GFX11-NEXT: s_mov_b32 s75, s76 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX11-NEXT: s_mov_b32 s77, s78 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-NEXT: s_mov_b32 s79, s88 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 +; GFX11-NEXT: s_mov_b32 s89, s90 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[28:29], 24 +; GFX11-NEXT: s_mov_b32 s91, s92 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[42:43], 24 +; GFX11-NEXT: s_mov_b32 s93, s94 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[46:47], 24 +; GFX11-NEXT: s_mov_b32 s95, vcc_lo +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s60 +; GFX11-NEXT: s_cbranch_vccnz .LBB13_3 +; GFX11-NEXT: .LBB13_2: ; %cmp.true ; GFX11-NEXT: s_add_i32 s1, s1, 3 ; GFX11-NEXT: s_add_i32 s0, s0, 3 -; GFX11-NEXT: v_writelane_b32 v37, s42, 18 -; GFX11-NEXT: s_lshr_b32 s42, s21, 24 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[0:1], 24 ; GFX11-NEXT: s_add_i32 s3, s3, 3 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], 24 ; GFX11-NEXT: s_add_i32 s2, s2, 3 -; GFX11-NEXT: v_writelane_b32 v37, s42, 19 -; GFX11-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 12 ; GFX11-NEXT: s_add_i32 s5, s5, 3 ; GFX11-NEXT: s_add_i32 s4, s4, 3 ; GFX11-NEXT: s_add_i32 s7, s7, 3 -; GFX11-NEXT: v_writelane_b32 v37, s42, 20 -; GFX11-NEXT: s_lshr_b32 s42, s21, 8 ; GFX11-NEXT: s_add_i32 s6, s6, 3 -; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: v_writelane_b32 v37, s23, 13 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[2:3], 24 ; GFX11-NEXT: s_add_i32 s9, s9, 3 -; GFX11-NEXT: v_writelane_b32 v37, s42, 21 -; GFX11-NEXT: s_lshr_b32 s42, s20, 16 ; GFX11-NEXT: s_add_i32 s8, s8, 3 -; GFX11-NEXT: s_add_i32 s27, s27, 3 ; GFX11-NEXT: s_add_i32 s11, s11, 3 -; GFX11-NEXT: v_writelane_b32 v37, s42, 22 -; GFX11-NEXT: s_lshr_b32 s42, s20, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 10 ; GFX11-NEXT: s_add_i32 s10, s10, 3 -; GFX11-NEXT: s_add_i32 s26, s26, 3 ; GFX11-NEXT: s_add_i32 s13, s13, 3 -; GFX11-NEXT: v_writelane_b32 v37, s42, 23 -; GFX11-NEXT: s_lshr_b32 s42, s23, 24 ; GFX11-NEXT: s_add_i32 s12, s12, 3 +; GFX11-NEXT: s_add_i32 s57, s57, 3 +; GFX11-NEXT: v_writelane_b32 v37, s23, 11 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[4:5], 24 +; GFX11-NEXT: s_add_i32 s56, s56, 3 +; GFX11-NEXT: s_add_i32 s47, s47, 3 +; GFX11-NEXT: s_add_i32 s46, s46, 3 +; GFX11-NEXT: v_writelane_b32 v37, s22, 8 +; GFX11-NEXT: s_add_i32 s43, s43, 3 +; GFX11-NEXT: s_add_i32 s42, s42, 3 ; GFX11-NEXT: s_add_i32 s29, s29, 3 -; GFX11-NEXT: s_add_i32 s15, s15, 3 -; GFX11-NEXT: v_writelane_b32 v37, s42, 24 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_add_i32 s14, s14, 3 -; GFX11-NEXT: s_add_i32 s41, s41, 3 -; GFX11-NEXT: s_add_i32 s40, s40, 3 -; GFX11-NEXT: v_writelane_b32 v37, s42, 25 -; GFX11-NEXT: s_lshr_b32 s42, s23, 8 ; GFX11-NEXT: s_add_i32 s28, s28, 3 +; GFX11-NEXT: v_writelane_b32 v37, s23, 9 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[6:7], 24 +; GFX11-NEXT: s_add_i32 s25, s25, 3 +; GFX11-NEXT: s_add_i32 s24, s24, 3 +; GFX11-NEXT: s_add_i32 s21, s21, 3 +; GFX11-NEXT: v_writelane_b32 v37, s22, 6 +; GFX11-NEXT: s_add_i32 s20, s20, 3 +; GFX11-NEXT: s_add_i32 s19, s19, 3 +; GFX11-NEXT: s_add_i32 s18, s18, 3 ; GFX11-NEXT: s_add_i32 s17, s17, 3 +; GFX11-NEXT: v_writelane_b32 v37, s23, 7 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[8:9], 24 ; GFX11-NEXT: s_add_i32 s16, s16, 3 -; GFX11-NEXT: v_writelane_b32 v37, s42, 26 -; GFX11-NEXT: s_lshr_b32 s42, s22, 16 -; GFX11-NEXT: s_lshr_b32 s101, s1, 24 -; GFX11-NEXT: s_lshr_b32 s102, s1, 16 -; GFX11-NEXT: s_lshr_b32 s103, s1, 8 -; GFX11-NEXT: v_writelane_b32 v37, s42, 27 -; GFX11-NEXT: s_lshr_b32 s42, s22, 8 -; GFX11-NEXT: s_lshr_b32 s104, s0, 16 -; GFX11-NEXT: s_lshr_b32 s47, s0, 8 -; GFX11-NEXT: s_lshr_b32 s46, s3, 24 -; GFX11-NEXT: v_writelane_b32 v37, s42, 28 -; GFX11-NEXT: s_lshr_b32 s42, s25, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s3, 16 -; GFX11-NEXT: s_lshr_b32 s34, s3, 8 -; GFX11-NEXT: s_lshr_b32 s57, s2, 16 -; GFX11-NEXT: v_writelane_b32 v37, s42, 29 -; GFX11-NEXT: s_lshr_b32 s42, s25, 16 -; GFX11-NEXT: s_lshr_b32 s56, s2, 8 -; GFX11-NEXT: s_lshr_b32 s35, s5, 24 -; GFX11-NEXT: s_lshr_b32 s36, s5, 16 -; GFX11-NEXT: v_writelane_b32 v37, s42, 30 -; GFX11-NEXT: s_lshr_b32 s42, s25, 8 -; GFX11-NEXT: s_lshr_b32 s37, s5, 8 -; GFX11-NEXT: s_lshr_b32 s38, s4, 16 -; GFX11-NEXT: s_lshr_b32 s39, s4, 8 -; GFX11-NEXT: v_writelane_b32 v37, s42, 31 -; GFX11-NEXT: s_lshr_b32 s42, s24, 16 -; GFX11-NEXT: s_lshr_b32 s48, s7, 24 -; GFX11-NEXT: v_writelane_b32 v36, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s24, 8 -; GFX11-NEXT: v_writelane_b32 v37, s62, 14 -; GFX11-NEXT: s_lshr_b32 s49, s7, 16 -; GFX11-NEXT: s_lshr_b32 s50, s7, 8 -; GFX11-NEXT: v_writelane_b32 v36, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s27, 24 -; GFX11-NEXT: v_writelane_b32 v37, s63, 15 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[2:3], 24 -; GFX11-NEXT: s_lshr_b32 s51, s6, 16 -; GFX11-NEXT: v_writelane_b32 v36, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-NEXT: v_writelane_b32 v37, s62, 12 -; GFX11-NEXT: s_lshr_b32 s52, s6, 8 -; GFX11-NEXT: s_lshr_b32 s53, s9, 24 -; GFX11-NEXT: v_writelane_b32 v36, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s27, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 13 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; GFX11-NEXT: s_lshr_b32 s54, s9, 16 -; GFX11-NEXT: v_writelane_b32 v36, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s26, 16 -; GFX11-NEXT: v_writelane_b32 v37, s62, 10 -; GFX11-NEXT: s_lshr_b32 s55, s9, 8 -; GFX11-NEXT: s_lshr_b32 s64, s8, 16 -; GFX11-NEXT: v_writelane_b32 v36, s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s26, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 11 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b32 s65, s8, 8 -; GFX11-NEXT: v_writelane_b32 v36, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s29, 24 -; GFX11-NEXT: v_writelane_b32 v37, s62, 8 -; GFX11-NEXT: s_lshr_b32 s66, s11, 24 -; GFX11-NEXT: s_lshr_b32 s67, s11, 16 -; GFX11-NEXT: v_writelane_b32 v36, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s29, 16 -; GFX11-NEXT: v_writelane_b32 v37, s63, 9 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b32 s68, s11, 8 -; GFX11-NEXT: v_writelane_b32 v36, s42, 8 -; GFX11-NEXT: s_lshr_b32 s59, s10, 16 -; GFX11-NEXT: v_writelane_b32 v37, s62, 6 -; GFX11-NEXT: s_lshr_b32 s58, s10, 8 -; GFX11-NEXT: s_lshr_b32 s70, s13, 24 -; GFX11-NEXT: s_lshr_b32 s71, s13, 16 -; GFX11-NEXT: s_lshr_b32 s60, s13, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 7 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 -; GFX11-NEXT: s_lshr_b32 s80, s12, 16 -; GFX11-NEXT: s_lshr_b32 s61, s12, 8 -; GFX11-NEXT: s_lshr_b32 s81, s15, 24 -; GFX11-NEXT: v_writelane_b32 v37, s62, 4 -; GFX11-NEXT: s_lshr_b32 s82, s15, 16 -; GFX11-NEXT: s_lshr_b32 s83, s15, 8 -; GFX11-NEXT: s_lshr_b32 s84, s14, 16 -; GFX11-NEXT: s_lshr_b32 s85, s14, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX11-NEXT: s_lshr_b32 s86, s17, 24 -; GFX11-NEXT: s_lshr_b32 s72, s17, 16 -; GFX11-NEXT: s_lshr_b32 s87, s17, 8 -; GFX11-NEXT: v_writelane_b32 v37, s62, 2 -; GFX11-NEXT: s_lshr_b32 s73, s16, 16 -; GFX11-NEXT: s_lshr_b32 s96, s16, 8 -; GFX11-NEXT: s_lshr_b32 s97, s19, 24 -; GFX11-NEXT: s_lshr_b32 s69, s19, 16 -; GFX11-NEXT: v_writelane_b32 v37, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_lshr_b32 s42, s29, 8 -; GFX11-NEXT: s_lshr_b32 s74, s28, 16 -; GFX11-NEXT: s_lshr_b32 s43, s28, 8 -; GFX11-NEXT: v_writelane_b32 v37, s62, 0 -; GFX11-NEXT: s_lshr_b32 s98, s41, 24 -; GFX11-NEXT: s_lshr_b32 s99, s41, 16 -; GFX11-NEXT: s_lshr_b32 s100, s41, 8 -; GFX11-NEXT: s_lshr_b32 s44, s40, 16 -; GFX11-NEXT: s_lshr_b32 s45, s40, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[22:23], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[26:27], 24 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[28:29], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v36, s42, 9 -; GFX11-NEXT: .LBB13_5: ; %end -; GFX11-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-NEXT: s_add_i32 s15, s15, 3 +; GFX11-NEXT: s_add_i32 s14, s14, 3 +; GFX11-NEXT: v_writelane_b32 v37, s22, 4 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX11-NEXT: v_writelane_b32 v37, s23, 5 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[24:25], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[28:29], 24 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[42:43], 24 +; GFX11-NEXT: v_writelane_b32 v37, s22, 2 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[46:47], 24 +; GFX11-NEXT: s_lshr_b64 s[98:99], s[56:57], 24 +; GFX11-NEXT: s_lshr_b32 s73, s1, 16 +; GFX11-NEXT: s_lshr_b32 s75, s1, 8 +; GFX11-NEXT: v_writelane_b32 v37, s23, 3 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[12:13], 24 +; GFX11-NEXT: s_lshr_b32 s77, s0, 16 +; GFX11-NEXT: s_lshr_b32 s79, s0, 8 +; GFX11-NEXT: s_lshr_b32 s89, s3, 24 +; GFX11-NEXT: v_writelane_b32 v37, s22, 0 +; GFX11-NEXT: s_lshr_b32 s22, s1, 24 +; GFX11-NEXT: s_lshr_b32 s91, s3, 16 +; GFX11-NEXT: s_lshr_b32 s93, s3, 8 +; GFX11-NEXT: s_lshr_b32 s95, s2, 16 +; GFX11-NEXT: v_writelane_b32 v37, s23, 1 +; GFX11-NEXT: s_lshr_b32 s31, s2, 8 +; GFX11-NEXT: s_lshr_b32 s97, s5, 24 +; GFX11-NEXT: s_lshr_b32 s63, s5, 16 +; GFX11-NEXT: s_lshr_b32 s99, s5, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 14 +; GFX11-NEXT: s_lshr_b32 s22, s4, 16 +; GFX11-NEXT: s_lshr_b32 s30, s13, 16 +; GFX11-NEXT: s_lshr_b32 s87, s13, 8 +; GFX11-NEXT: s_lshr_b32 s62, s12, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 15 +; GFX11-NEXT: s_lshr_b32 s22, s4, 8 +; GFX11-NEXT: s_lshr_b32 s96, s12, 8 +; GFX11-NEXT: s_lshr_b32 s100, s15, 24 +; GFX11-NEXT: s_lshr_b32 s101, s15, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 16 +; GFX11-NEXT: s_lshr_b32 s22, s7, 24 +; GFX11-NEXT: s_lshr_b32 s102, s15, 8 +; GFX11-NEXT: s_lshr_b32 s103, s14, 16 +; GFX11-NEXT: s_lshr_b32 s104, s14, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 17 +; GFX11-NEXT: s_lshr_b32 s22, s7, 16 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s17, 24 +; GFX11-NEXT: s_lshr_b32 s34, s17, 16 +; GFX11-NEXT: s_lshr_b32 s35, s17, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 18 +; GFX11-NEXT: s_lshr_b32 s22, s7, 8 +; GFX11-NEXT: s_lshr_b32 s36, s16, 16 +; GFX11-NEXT: s_lshr_b32 s37, s16, 8 +; GFX11-NEXT: s_lshr_b32 s38, s19, 24 +; GFX11-NEXT: v_writelane_b32 v37, s22, 19 +; GFX11-NEXT: s_lshr_b32 s22, s6, 16 +; GFX11-NEXT: s_lshr_b32 s39, s19, 16 +; GFX11-NEXT: s_lshr_b32 s48, s19, 8 +; GFX11-NEXT: s_lshr_b32 s49, s18, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 20 +; GFX11-NEXT: s_lshr_b32 s22, s6, 8 +; GFX11-NEXT: s_lshr_b32 s50, s18, 8 +; GFX11-NEXT: s_lshr_b32 s51, s21, 24 +; GFX11-NEXT: s_lshr_b32 s52, s21, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 21 +; GFX11-NEXT: s_lshr_b32 s22, s9, 24 +; GFX11-NEXT: s_lshr_b32 s53, s21, 8 +; GFX11-NEXT: s_lshr_b32 s54, s20, 16 +; GFX11-NEXT: s_lshr_b32 s55, s20, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 22 +; GFX11-NEXT: s_lshr_b32 s22, s9, 16 +; GFX11-NEXT: s_lshr_b32 s64, s25, 24 +; GFX11-NEXT: s_lshr_b32 s65, s25, 16 +; GFX11-NEXT: s_lshr_b32 s23, s24, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 23 +; GFX11-NEXT: s_lshr_b32 s22, s9, 8 +; GFX11-NEXT: s_lshr_b32 s66, s24, 8 +; GFX11-NEXT: s_lshr_b32 s67, s29, 24 +; GFX11-NEXT: s_lshr_b32 s68, s29, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 24 +; GFX11-NEXT: s_lshr_b32 s22, s8, 16 +; GFX11-NEXT: s_lshr_b32 s26, s29, 8 +; GFX11-NEXT: s_lshr_b32 s27, s28, 16 +; GFX11-NEXT: s_lshr_b32 s40, s28, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 25 +; GFX11-NEXT: s_lshr_b32 s22, s8, 8 +; GFX11-NEXT: s_lshr_b32 s41, s43, 24 +; GFX11-NEXT: s_lshr_b32 s44, s43, 16 +; GFX11-NEXT: s_lshr_b32 s45, s43, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 26 +; GFX11-NEXT: s_lshr_b32 s22, s11, 24 +; GFX11-NEXT: s_lshr_b32 s69, s42, 16 +; GFX11-NEXT: s_lshr_b32 s70, s42, 8 +; GFX11-NEXT: s_lshr_b32 s71, s47, 24 +; GFX11-NEXT: v_writelane_b32 v37, s22, 27 +; GFX11-NEXT: s_lshr_b32 s22, s11, 16 +; GFX11-NEXT: s_lshr_b32 s80, s47, 16 +; GFX11-NEXT: s_lshr_b32 s81, s47, 8 +; GFX11-NEXT: s_lshr_b32 s82, s46, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 28 +; GFX11-NEXT: s_lshr_b32 s22, s11, 8 +; GFX11-NEXT: s_lshr_b32 s83, s46, 8 +; GFX11-NEXT: s_lshr_b32 s84, s57, 24 +; GFX11-NEXT: s_lshr_b32 s58, s57, 16 +; GFX11-NEXT: v_writelane_b32 v37, s22, 29 +; GFX11-NEXT: s_lshr_b32 s22, s10, 16 +; GFX11-NEXT: s_lshr_b32 s59, s57, 8 +; GFX11-NEXT: s_lshr_b32 s85, s56, 16 +; GFX11-NEXT: s_lshr_b32 s86, s56, 8 +; GFX11-NEXT: v_writelane_b32 v37, s22, 30 +; GFX11-NEXT: s_lshr_b32 s22, s10, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v37, s22, 31 +; GFX11-NEXT: s_lshr_b32 s22, s13, 24 +; GFX11-NEXT: v_writelane_b32 v36, s22, 0 +; GFX11-NEXT: s_lshr_b32 s22, s25, 8 +; GFX11-NEXT: .LBB13_3: ; %end +; GFX11-NEXT: s_and_b32 s57, s57, 0xff +; GFX11-NEXT: s_lshl_b32 s59, s59, 8 ; GFX11-NEXT: s_and_b32 s28, s28, 0xff -; GFX11-NEXT: s_and_b32 s42, s74, 0xff -; GFX11-NEXT: s_or_b32 s28, s28, s43 -; GFX11-NEXT: s_lshl_b32 s43, s94, 8 -; GFX11-NEXT: s_and_b32 s28, s28, 0xffff -; GFX11-NEXT: s_or_b32 s42, s42, s43 -; GFX11-NEXT: s_and_b32 s29, s29, 0xff -; GFX11-NEXT: s_lshl_b32 s42, s42, 16 -; GFX11-NEXT: v_readlane_b32 s43, v36, 7 -; GFX11-NEXT: s_or_b32 s28, s28, s42 -; GFX11-NEXT: v_readlane_b32 s42, v36, 9 -; GFX11-NEXT: s_and_b32 s26, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s40, s40, 8 +; GFX11-NEXT: s_or_b32 s57, s57, s59 +; GFX11-NEXT: s_and_b32 s58, s58, 0xff +; GFX11-NEXT: s_lshl_b32 s59, s84, 8 +; GFX11-NEXT: s_or_b32 s28, s28, s40 ; GFX11-NEXT: s_and_b32 s27, s27, 0xff -; GFX11-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-NEXT: s_lshl_b32 s40, s90, 8 +; GFX11-NEXT: s_or_b32 s58, s58, s59 +; GFX11-NEXT: s_or_b32 s27, s27, s40 +; GFX11-NEXT: s_and_b32 s57, s57, 0xffff +; GFX11-NEXT: s_lshl_b32 s58, s58, 16 +; GFX11-NEXT: s_and_b32 s28, s28, 0xffff +; GFX11-NEXT: s_lshl_b32 s27, s27, 16 +; GFX11-NEXT: s_or_b32 s57, s57, s58 +; GFX11-NEXT: s_and_b32 s46, s46, 0xff +; GFX11-NEXT: s_lshl_b32 s58, s83, 8 +; GFX11-NEXT: s_or_b32 s27, s28, s27 +; GFX11-NEXT: s_and_b32 s28, s29, 0xff +; GFX11-NEXT: s_lshl_b32 s26, s26, 8 +; GFX11-NEXT: s_or_b32 s46, s46, s58 +; GFX11-NEXT: s_and_b32 s58, s82, 0xff +; GFX11-NEXT: s_lshl_b32 s59, s94, 8 +; GFX11-NEXT: s_or_b32 s26, s28, s26 +; GFX11-NEXT: s_and_b32 s28, s68, 0xff +; GFX11-NEXT: s_lshl_b32 s29, s67, 8 +; GFX11-NEXT: s_or_b32 s58, s58, s59 +; GFX11-NEXT: s_or_b32 s28, s28, s29 +; GFX11-NEXT: s_and_b32 s46, s46, 0xffff +; GFX11-NEXT: s_lshl_b32 s58, s58, 16 +; GFX11-NEXT: s_and_b32 s26, s26, 0xffff +; GFX11-NEXT: s_lshl_b32 s28, s28, 16 +; GFX11-NEXT: s_or_b32 s46, s46, s58 +; GFX11-NEXT: s_and_b32 s47, s47, 0xff +; GFX11-NEXT: s_lshl_b32 s58, s81, 8 +; GFX11-NEXT: s_or_b32 s26, s26, s28 +; GFX11-NEXT: s_or_b32 s47, s47, s58 +; GFX11-NEXT: s_and_b32 s58, s80, 0xff +; GFX11-NEXT: s_lshl_b32 s59, s71, 8 +; GFX11-NEXT: v_dual_mov_b32 v7, s27 :: v_dual_mov_b32 v8, s26 ; GFX11-NEXT: s_and_b32 s24, s24, 0xff -; GFX11-NEXT: s_lshl_b32 s42, s42, 8 -; GFX11-NEXT: s_and_b32 s25, s25, 0xff -; GFX11-NEXT: s_or_b32 s29, s29, s42 -; GFX11-NEXT: v_readlane_b32 s42, v36, 8 -; GFX11-NEXT: s_and_b32 s29, s29, 0xffff -; GFX11-NEXT: s_and_b32 s22, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s26, s66, 8 +; GFX11-NEXT: s_or_b32 s58, s58, s59 +; GFX11-NEXT: s_or_b32 s24, s24, s26 ; GFX11-NEXT: s_and_b32 s23, s23, 0xff -; GFX11-NEXT: s_and_b32 s20, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s26, s88, 8 +; GFX11-NEXT: s_and_b32 s25, s25, 0xff +; GFX11-NEXT: s_lshl_b32 s22, s22, 8 +; GFX11-NEXT: s_and_b32 s47, s47, 0xffff +; GFX11-NEXT: s_lshl_b32 s58, s58, 16 +; GFX11-NEXT: s_or_b32 s23, s23, s26 +; GFX11-NEXT: s_or_b32 s22, s25, s22 +; GFX11-NEXT: s_and_b32 s25, s65, 0xff +; GFX11-NEXT: s_lshl_b32 s26, s64, 8 +; GFX11-NEXT: s_and_b32 s56, s56, 0xff +; GFX11-NEXT: s_lshl_b32 s60, s86, 8 +; GFX11-NEXT: s_or_b32 s47, s47, s58 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s46 :: v_dual_mov_b32 v4, s47 ; GFX11-NEXT: s_and_b32 s42, s42, 0xff -; GFX11-NEXT: s_and_b32 s21, s21, 0xff -; GFX11-NEXT: s_or_b32 s42, s42, s43 +; GFX11-NEXT: s_lshl_b32 s46, s70, 8 +; GFX11-NEXT: s_or_b32 s25, s25, s26 +; GFX11-NEXT: s_or_b32 s56, s56, s60 +; GFX11-NEXT: s_and_b32 s60, s85, 0xff +; GFX11-NEXT: s_lshl_b32 s61, s98, 8 +; GFX11-NEXT: s_or_b32 s42, s42, s46 +; GFX11-NEXT: s_and_b32 s46, s69, 0xff +; GFX11-NEXT: s_lshl_b32 s47, s92, 8 +; GFX11-NEXT: s_and_b32 s43, s43, 0xff ; GFX11-NEXT: s_lshl_b32 s45, s45, 8 -; GFX11-NEXT: s_lshl_b32 s42, s42, 16 -; GFX11-NEXT: s_and_b32 s40, s40, 0xff -; GFX11-NEXT: s_or_b32 s29, s29, s42 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s28 :: v_dual_mov_b32 v4, s29 -; GFX11-NEXT: v_readlane_b32 s28, v36, 6 -; GFX11-NEXT: v_readlane_b32 s29, v36, 5 -; GFX11-NEXT: s_or_b32 s40, s40, s45 -; GFX11-NEXT: s_lshl_b32 s45, s30, 8 ; GFX11-NEXT: s_and_b32 s44, s44, 0xff -; GFX11-NEXT: s_lshl_b32 s28, s28, 8 -; GFX11-NEXT: s_and_b32 s29, s29, 0xff -; GFX11-NEXT: s_or_b32 s26, s26, s28 -; GFX11-NEXT: s_lshl_b32 s28, s92, 8 -; GFX11-NEXT: s_and_b32 s26, s26, 0xffff -; GFX11-NEXT: s_or_b32 s28, s29, s28 -; GFX11-NEXT: v_readlane_b32 s29, v36, 2 -; GFX11-NEXT: s_lshl_b32 s28, s28, 16 -; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_or_b32 s26, s26, s28 -; GFX11-NEXT: v_readlane_b32 s28, v36, 4 -; GFX11-NEXT: s_lshl_b32 s29, s29, 8 -; GFX11-NEXT: s_and_b32 s18, s18, 0xff -; GFX11-NEXT: s_and_b32 s40, s40, 0xffff -; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_lshl_b32 s28, s28, 8 -; GFX11-NEXT: s_or_b32 s40, s40, s44 -; GFX11-NEXT: s_or_b32 s27, s27, s28 -; GFX11-NEXT: v_readlane_b32 s28, v36, 3 -; GFX11-NEXT: s_and_b32 s27, s27, 0xffff -; GFX11-NEXT: s_and_b32 s41, s41, 0xff -; GFX11-NEXT: s_lshl_b32 s44, s100, 8 -; GFX11-NEXT: s_lshl_b32 s45, s98, 8 -; GFX11-NEXT: s_and_b32 s28, s28, 0xff -; GFX11-NEXT: s_or_b32 s41, s41, s44 -; GFX11-NEXT: s_or_b32 s28, s28, s29 -; GFX11-NEXT: v_readlane_b32 s29, v36, 0 -; GFX11-NEXT: s_lshl_b32 s28, s28, 16 -; GFX11-NEXT: s_and_b32 s44, s99, 0xff -; GFX11-NEXT: s_or_b32 s27, s27, s28 -; GFX11-NEXT: v_readlane_b32 s28, v36, 1 -; GFX11-NEXT: s_and_b32 s29, s29, 0xff -; GFX11-NEXT: v_dual_mov_b32 v5, s26 :: v_dual_mov_b32 v6, s27 -; GFX11-NEXT: v_readlane_b32 s26, v37, 19 -; GFX11-NEXT: s_lshl_b32 s28, s28, 8 -; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_or_b32 s24, s24, s28 -; GFX11-NEXT: s_lshl_b32 s28, s90, 8 +; GFX11-NEXT: s_lshl_b32 s41, s41, 8 ; GFX11-NEXT: s_and_b32 s24, s24, 0xffff -; GFX11-NEXT: s_or_b32 s28, s29, s28 -; GFX11-NEXT: v_readlane_b32 s29, v37, 29 -; GFX11-NEXT: s_lshl_b32 s28, s28, 16 -; GFX11-NEXT: s_lshl_b32 s26, s26, 8 -; GFX11-NEXT: s_or_b32 s24, s24, s28 -; GFX11-NEXT: v_readlane_b32 s28, v37, 31 -; GFX11-NEXT: s_lshl_b32 s29, s29, 8 -; GFX11-NEXT: s_and_b32 s19, s19, 0xff -; GFX11-NEXT: s_and_b32 s41, s41, 0xffff -; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_lshl_b32 s28, s28, 8 -; GFX11-NEXT: s_or_b32 s41, s41, s44 -; GFX11-NEXT: s_or_b32 s25, s25, s28 -; GFX11-NEXT: v_readlane_b32 s28, v37, 30 -; GFX11-NEXT: s_and_b32 s25, s25, 0xffff -; GFX11-NEXT: v_dual_mov_b32 v1, s40 :: v_dual_mov_b32 v2, s41 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: s_and_b32 s28, s28, 0xff -; GFX11-NEXT: s_and_b32 s17, s17, 0xff -; GFX11-NEXT: s_or_b32 s28, s28, s29 -; GFX11-NEXT: s_and_b32 s14, s14, 0xff -; GFX11-NEXT: s_lshl_b32 s28, s28, 16 -; GFX11-NEXT: s_and_b32 s15, s15, 0xff -; GFX11-NEXT: s_or_b32 s25, s25, s28 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, s24 :: v_dual_mov_b32 v8, s25 -; GFX11-NEXT: v_readlane_b32 s24, v37, 28 -; GFX11-NEXT: v_readlane_b32 s25, v37, 27 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-NEXT: s_and_b32 s12, s12, 0xff -; GFX11-NEXT: s_lshl_b32 s24, s24, 8 -; GFX11-NEXT: s_and_b32 s25, s25, 0xff -; GFX11-NEXT: s_or_b32 s22, s22, s24 -; GFX11-NEXT: s_lshl_b32 s24, s78, 8 +; GFX11-NEXT: s_lshl_b32 s23, s23, 16 ; GFX11-NEXT: s_and_b32 s22, s22, 0xffff -; GFX11-NEXT: s_or_b32 s24, s25, s24 -; GFX11-NEXT: v_readlane_b32 s25, v37, 24 -; GFX11-NEXT: s_lshl_b32 s24, s24, 16 -; GFX11-NEXT: s_and_b32 s13, s13, 0xff -; GFX11-NEXT: s_or_b32 s22, s22, s24 -; GFX11-NEXT: v_readlane_b32 s24, v37, 26 -; GFX11-NEXT: s_lshl_b32 s25, s25, 8 -; GFX11-NEXT: s_and_b32 s10, s10, 0xff -; GFX11-NEXT: s_and_b32 s11, s11, 0xff -; GFX11-NEXT: s_and_b32 s8, s8, 0xff -; GFX11-NEXT: s_lshl_b32 s24, s24, 8 -; GFX11-NEXT: s_and_b32 s9, s9, 0xff -; GFX11-NEXT: s_or_b32 s23, s23, s24 -; GFX11-NEXT: v_readlane_b32 s24, v37, 25 -; GFX11-NEXT: s_and_b32 s23, s23, 0xffff -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_and_b32 s7, s7, 0xff -; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s24, s24, 0xff -; GFX11-NEXT: s_and_b32 s5, s5, 0xff -; GFX11-NEXT: s_or_b32 s24, s24, s25 -; GFX11-NEXT: v_readlane_b32 s25, v37, 22 -; GFX11-NEXT: s_lshl_b32 s24, s24, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_or_b32 s23, s23, s24 -; GFX11-NEXT: v_readlane_b32 s24, v37, 23 -; GFX11-NEXT: s_and_b32 s25, s25, 0xff -; GFX11-NEXT: v_dual_mov_b32 v9, s22 :: v_dual_mov_b32 v10, s23 -; GFX11-NEXT: s_lshl_b32 s22, s88, 8 -; GFX11-NEXT: s_lshl_b32 s24, s24, 8 -; GFX11-NEXT: s_lshl_b32 s23, s97, 8 +; GFX11-NEXT: s_lshl_b32 s25, s25, 16 +; GFX11-NEXT: s_or_b32 s60, s60, s61 +; GFX11-NEXT: s_or_b32 s46, s46, s47 +; GFX11-NEXT: s_or_b32 s43, s43, s45 +; GFX11-NEXT: s_or_b32 s41, s44, s41 +; GFX11-NEXT: s_or_b32 s23, s24, s23 +; GFX11-NEXT: s_or_b32 s22, s22, s25 +; GFX11-NEXT: s_and_b32 s20, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s24, s55, 8 +; GFX11-NEXT: s_and_b32 s25, s54, 0xff +; GFX11-NEXT: s_lshl_b32 s26, s78, 8 +; GFX11-NEXT: s_and_b32 s56, s56, 0xffff +; GFX11-NEXT: s_lshl_b32 s60, s60, 16 +; GFX11-NEXT: s_and_b32 s42, s42, 0xffff +; GFX11-NEXT: s_lshl_b32 s46, s46, 16 +; GFX11-NEXT: s_and_b32 s43, s43, 0xffff +; GFX11-NEXT: s_lshl_b32 s41, s41, 16 ; GFX11-NEXT: s_or_b32 s20, s20, s24 -; GFX11-NEXT: s_lshl_b32 s24, s62, 8 +; GFX11-NEXT: s_or_b32 s24, s25, s26 +; GFX11-NEXT: s_and_b32 s21, s21, 0xff +; GFX11-NEXT: s_lshl_b32 s25, s53, 8 +; GFX11-NEXT: s_and_b32 s26, s52, 0xff +; GFX11-NEXT: s_lshl_b32 s27, s51, 8 +; GFX11-NEXT: s_or_b32 s56, s56, s60 +; GFX11-NEXT: s_or_b32 s42, s42, s46 +; GFX11-NEXT: s_or_b32 s41, s43, s41 +; GFX11-NEXT: s_or_b32 s21, s21, s25 +; GFX11-NEXT: s_or_b32 s25, s26, s27 +; GFX11-NEXT: v_dual_mov_b32 v1, s56 :: v_dual_mov_b32 v2, s57 +; GFX11-NEXT: v_dual_mov_b32 v5, s42 :: v_dual_mov_b32 v6, s41 ; GFX11-NEXT: s_and_b32 s20, s20, 0xffff -; GFX11-NEXT: s_or_b32 s24, s25, s24 -; GFX11-NEXT: v_readlane_b32 s25, v37, 21 ; GFX11-NEXT: s_lshl_b32 s24, s24, 16 -; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_and_b32 s21, s21, 0xffff +; GFX11-NEXT: s_lshl_b32 s25, s25, 16 ; GFX11-NEXT: s_or_b32 s20, s20, s24 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s25, s25, 8 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_or_b32 s21, s21, s25 -; GFX11-NEXT: v_readlane_b32 s25, v37, 20 -; GFX11-NEXT: s_and_b32 s21, s21, 0xffff -; GFX11-NEXT: v_readlane_b32 s100, v35, 4 -; GFX11-NEXT: v_readlane_b32 s99, v35, 3 -; GFX11-NEXT: v_readlane_b32 s98, v35, 2 -; GFX11-NEXT: s_and_b32 s25, s25, 0xff -; GFX11-NEXT: v_readlane_b32 s97, v35, 1 -; GFX11-NEXT: s_or_b32 s25, s25, s26 -; GFX11-NEXT: v_readlane_b32 s31, v34, 1 -; GFX11-NEXT: s_lshl_b32 s24, s25, 16 -; GFX11-NEXT: v_readlane_b32 s30, v34, 0 -; GFX11-NEXT: s_or_b32 s21, s21, s24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v12, s21 -; GFX11-NEXT: v_readlane_b32 s20, v37, 18 -; GFX11-NEXT: v_readlane_b32 s21, v37, 17 -; GFX11-NEXT: s_lshl_b32 s20, s20, 8 -; GFX11-NEXT: s_and_b32 s21, s21, 0xff +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: v_dual_mov_b32 v1, s23 :: v_dual_mov_b32 v2, s22 +; GFX11-NEXT: v_dual_mov_b32 v3, s20 :: v_dual_mov_b32 v4, s21 +; GFX11-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s20, s50, 8 +; GFX11-NEXT: s_and_b32 s21, s49, 0xff +; GFX11-NEXT: s_lshl_b32 s22, s76, 8 ; GFX11-NEXT: s_or_b32 s18, s18, s20 ; GFX11-NEXT: s_or_b32 s20, s21, s22 -; GFX11-NEXT: v_readlane_b32 s21, v37, 16 -; GFX11-NEXT: s_and_b32 s22, s69, 0xff -; GFX11-NEXT: s_and_b32 s18, s18, 0xffff -; GFX11-NEXT: s_lshl_b32 s20, s20, 16 -; GFX11-NEXT: v_readlane_b32 s69, v34, 21 -; GFX11-NEXT: s_lshl_b32 s21, s21, 8 -; GFX11-NEXT: s_or_b32 s18, s18, s20 +; GFX11-NEXT: s_and_b32 s19, s19, 0xff +; GFX11-NEXT: s_lshl_b32 s21, s48, 8 +; GFX11-NEXT: s_and_b32 s22, s39, 0xff +; GFX11-NEXT: s_lshl_b32 s23, s38, 8 ; GFX11-NEXT: s_or_b32 s19, s19, s21 ; GFX11-NEXT: s_or_b32 s21, s22, s23 +; GFX11-NEXT: s_and_b32 s18, s18, 0xffff +; GFX11-NEXT: s_lshl_b32 s20, s20, 16 ; GFX11-NEXT: s_and_b32 s19, s19, 0xffff ; GFX11-NEXT: s_lshl_b32 s21, s21, 16 -; GFX11-NEXT: s_lshl_b32 s20, s96, 8 +; GFX11-NEXT: s_or_b32 s18, s18, s20 ; GFX11-NEXT: s_or_b32 s19, s19, s21 -; GFX11-NEXT: s_and_b32 s21, s73, 0xff -; GFX11-NEXT: s_lshl_b32 s22, s76, 8 +; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s20, s37, 8 +; GFX11-NEXT: s_and_b32 s21, s36, 0xff +; GFX11-NEXT: s_lshl_b32 s22, s74, 8 ; GFX11-NEXT: s_or_b32 s16, s16, s20 ; GFX11-NEXT: s_or_b32 s20, s21, s22 -; GFX11-NEXT: s_lshl_b32 s21, s87, 8 -; GFX11-NEXT: s_and_b32 s22, s72, 0xff -; GFX11-NEXT: s_lshl_b32 s23, s86, 8 +; GFX11-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-NEXT: s_lshl_b32 s21, s35, 8 +; GFX11-NEXT: s_and_b32 s22, s34, 0xff +; GFX11-NEXT: s_lshl_b32 s23, vcc_hi, 8 ; GFX11-NEXT: s_or_b32 s17, s17, s21 ; GFX11-NEXT: s_or_b32 s21, s22, s23 -; GFX11-NEXT: v_dual_mov_b32 v1, s18 :: v_dual_mov_b32 v2, s19 -; GFX11-NEXT: v_readlane_b32 s18, v37, 0 ; GFX11-NEXT: s_and_b32 s16, s16, 0xffff ; GFX11-NEXT: s_lshl_b32 s20, s20, 16 ; GFX11-NEXT: s_and_b32 s17, s17, 0xffff ; GFX11-NEXT: s_lshl_b32 s21, s21, 16 ; GFX11-NEXT: s_or_b32 s16, s16, s20 ; GFX11-NEXT: s_or_b32 s17, s17, s21 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_mov_b32 v4, s17 -; GFX11-NEXT: s_lshl_b32 s16, s85, 8 -; GFX11-NEXT: s_and_b32 s17, s84, 0xff -; GFX11-NEXT: s_lshl_b32 s18, s18, 8 -; GFX11-NEXT: v_readlane_b32 s19, v37, 1 +; GFX11-NEXT: v_dual_mov_b32 v5, s18 :: v_dual_mov_b32 v6, s19 +; GFX11-NEXT: v_dual_mov_b32 v7, s16 :: v_dual_mov_b32 v8, s17 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_lshl_b32 s16, s104, 8 +; GFX11-NEXT: s_and_b32 s17, s103, 0xff +; GFX11-NEXT: s_lshl_b32 s18, s72, 8 ; GFX11-NEXT: s_or_b32 s14, s14, s16 ; GFX11-NEXT: s_or_b32 s16, s17, s18 -; GFX11-NEXT: s_lshl_b32 s17, s83, 8 -; GFX11-NEXT: s_and_b32 s18, s82, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s81, 8 +; GFX11-NEXT: s_and_b32 s15, s15, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s102, 8 +; GFX11-NEXT: s_and_b32 s18, s101, 0xff +; GFX11-NEXT: s_lshl_b32 s19, s100, 8 ; GFX11-NEXT: s_or_b32 s15, s15, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: v_readlane_b32 s18, v37, 2 +; GFX11-NEXT: v_readlane_b32 s18, v37, 0 +; GFX11-NEXT: v_readlane_b32 s19, v37, 1 +; GFX11-NEXT: v_readlane_b32 s19, v36, 0 ; GFX11-NEXT: s_and_b32 s14, s14, 0xffff ; GFX11-NEXT: s_lshl_b32 s16, s16, 16 ; GFX11-NEXT: s_and_b32 s15, s15, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 ; GFX11-NEXT: s_or_b32 s14, s14, s16 ; GFX11-NEXT: s_or_b32 s15, s15, s17 -; GFX11-NEXT: s_lshl_b32 s16, s61, 8 -; GFX11-NEXT: s_and_b32 s17, s80, 0xff +; GFX11-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-NEXT: s_lshl_b32 s16, s96, 8 +; GFX11-NEXT: s_and_b32 s17, s62, 0xff ; GFX11-NEXT: s_lshl_b32 s18, s18, 8 -; GFX11-NEXT: v_readlane_b32 s19, v37, 3 ; GFX11-NEXT: s_or_b32 s12, s12, s16 ; GFX11-NEXT: s_or_b32 s16, s17, s18 -; GFX11-NEXT: s_lshl_b32 s17, s60, 8 -; GFX11-NEXT: s_and_b32 s18, s71, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s70, 8 +; GFX11-NEXT: s_and_b32 s13, s13, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s87, 8 +; GFX11-NEXT: s_and_b32 s18, s30, 0xff +; GFX11-NEXT: s_lshl_b32 s19, s19, 8 ; GFX11-NEXT: s_or_b32 s13, s13, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: v_dual_mov_b32 v5, s14 :: v_dual_mov_b32 v6, s15 -; GFX11-NEXT: v_readlane_b32 s14, v37, 4 ; GFX11-NEXT: s_and_b32 s12, s12, 0xffff ; GFX11-NEXT: s_lshl_b32 s16, s16, 16 ; GFX11-NEXT: s_and_b32 s13, s13, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 ; GFX11-NEXT: s_or_b32 s12, s12, s16 ; GFX11-NEXT: s_or_b32 s13, s13, s17 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, s12 :: v_dual_mov_b32 v8, s13 -; GFX11-NEXT: s_lshl_b32 s12, s58, 8 -; GFX11-NEXT: s_and_b32 s13, s59, 0xff +; GFX11-NEXT: v_dual_mov_b32 v9, s14 :: v_dual_mov_b32 v10, s15 +; GFX11-NEXT: v_dual_mov_b32 v11, s12 :: v_dual_mov_b32 v12, s13 +; GFX11-NEXT: v_readlane_b32 s12, v37, 31 +; GFX11-NEXT: v_readlane_b32 s13, v37, 30 +; GFX11-NEXT: v_readlane_b32 s14, v37, 2 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: v_readlane_b32 s15, v37, 3 +; GFX11-NEXT: s_lshl_b32 s12, s12, 8 +; GFX11-NEXT: s_and_b32 s13, s13, 0xff ; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: v_readlane_b32 s15, v37, 5 ; GFX11-NEXT: s_or_b32 s10, s10, s12 ; GFX11-NEXT: s_or_b32 s12, s13, s14 -; GFX11-NEXT: s_lshl_b32 s13, s68, 8 -; GFX11-NEXT: s_and_b32 s14, s67, 0xff -; GFX11-NEXT: s_lshl_b32 s15, s66, 8 +; GFX11-NEXT: v_readlane_b32 s13, v37, 29 +; GFX11-NEXT: v_readlane_b32 s14, v37, 28 +; GFX11-NEXT: v_readlane_b32 s15, v37, 27 +; GFX11-NEXT: s_and_b32 s11, s11, 0xff +; GFX11-NEXT: s_and_b32 s10, s10, 0xffff +; GFX11-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_lshl_b32 s15, s15, 8 ; GFX11-NEXT: s_or_b32 s11, s11, s13 ; GFX11-NEXT: s_or_b32 s13, s14, s15 -; GFX11-NEXT: v_readlane_b32 s14, v37, 6 -; GFX11-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-NEXT: s_lshl_b32 s12, s12, 16 ; GFX11-NEXT: s_and_b32 s11, s11, 0xffff ; GFX11-NEXT: s_lshl_b32 s13, s13, 16 ; GFX11-NEXT: s_or_b32 s10, s10, s12 ; GFX11-NEXT: s_or_b32 s11, s11, s13 -; GFX11-NEXT: s_lshl_b32 s12, s65, 8 -; GFX11-NEXT: s_and_b32 s13, s64, 0xff +; GFX11-NEXT: v_readlane_b32 s12, v37, 26 +; GFX11-NEXT: v_readlane_b32 s13, v37, 25 +; GFX11-NEXT: v_readlane_b32 s14, v37, 4 +; GFX11-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-NEXT: v_readlane_b32 s15, v37, 5 +; GFX11-NEXT: s_lshl_b32 s12, s12, 8 +; GFX11-NEXT: s_and_b32 s13, s13, 0xff ; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: v_readlane_b32 s15, v37, 7 ; GFX11-NEXT: s_or_b32 s8, s8, s12 ; GFX11-NEXT: s_or_b32 s12, s13, s14 -; GFX11-NEXT: s_lshl_b32 s13, s55, 8 -; GFX11-NEXT: s_and_b32 s14, s54, 0xff -; GFX11-NEXT: s_lshl_b32 s15, s53, 8 +; GFX11-NEXT: v_readlane_b32 s13, v37, 24 +; GFX11-NEXT: v_readlane_b32 s14, v37, 23 +; GFX11-NEXT: v_readlane_b32 s15, v37, 22 +; GFX11-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_lshl_b32 s15, s15, 8 ; GFX11-NEXT: s_or_b32 s9, s9, s13 ; GFX11-NEXT: s_or_b32 s13, s14, s15 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 -; GFX11-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 -; GFX11-NEXT: v_readlane_b32 s10, v37, 8 -; GFX11-NEXT: s_and_b32 s8, s8, 0xffff ; GFX11-NEXT: s_lshl_b32 s12, s12, 16 ; GFX11-NEXT: s_and_b32 s9, s9, 0xffff ; GFX11-NEXT: s_lshl_b32 s13, s13, 16 ; GFX11-NEXT: s_or_b32 s8, s8, s12 ; GFX11-NEXT: s_or_b32 s9, s9, s13 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_mov_b32 v12, s9 -; GFX11-NEXT: s_lshl_b32 s8, s52, 8 -; GFX11-NEXT: s_and_b32 s9, s51, 0xff +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-NEXT: v_dual_mov_b32 v1, s10 :: v_dual_mov_b32 v2, s11 +; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_readlane_b32 s8, v37, 21 +; GFX11-NEXT: v_readlane_b32 s9, v37, 20 +; GFX11-NEXT: v_readlane_b32 s10, v37, 6 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: v_readlane_b32 s11, v37, 7 +; GFX11-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-NEXT: s_and_b32 s9, s9, 0xff ; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: v_readlane_b32 s11, v37, 9 ; GFX11-NEXT: s_or_b32 s6, s6, s8 ; GFX11-NEXT: s_or_b32 s8, s9, s10 -; GFX11-NEXT: s_lshl_b32 s9, s50, 8 -; GFX11-NEXT: s_and_b32 s10, s49, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s48, 8 +; GFX11-NEXT: v_readlane_b32 s9, v37, 19 +; GFX11-NEXT: v_readlane_b32 s10, v37, 18 +; GFX11-NEXT: v_readlane_b32 s11, v37, 17 +; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_and_b32 s6, s6, 0xffff +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s11, 8 ; GFX11-NEXT: s_or_b32 s7, s7, s9 ; GFX11-NEXT: s_or_b32 s9, s10, s11 -; GFX11-NEXT: v_readlane_b32 s10, v37, 10 -; GFX11-NEXT: s_and_b32 s6, s6, 0xffff ; GFX11-NEXT: s_lshl_b32 s8, s8, 16 ; GFX11-NEXT: s_and_b32 s7, s7, 0xffff ; GFX11-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-NEXT: s_or_b32 s6, s6, s8 ; GFX11-NEXT: s_or_b32 s7, s7, s9 -; GFX11-NEXT: s_lshl_b32 s8, s39, 8 -; GFX11-NEXT: s_and_b32 s9, s38, 0xff +; GFX11-NEXT: v_readlane_b32 s8, v37, 16 +; GFX11-NEXT: v_readlane_b32 s9, v37, 15 +; GFX11-NEXT: v_readlane_b32 s10, v37, 8 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: v_readlane_b32 s11, v37, 9 +; GFX11-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-NEXT: s_and_b32 s9, s9, 0xff ; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: v_readlane_b32 s11, v37, 11 ; GFX11-NEXT: s_or_b32 s4, s4, s8 ; GFX11-NEXT: s_or_b32 s8, s9, s10 -; GFX11-NEXT: s_lshl_b32 s9, s37, 8 -; GFX11-NEXT: s_and_b32 s10, s36, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s35, 8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s99, 8 +; GFX11-NEXT: s_and_b32 s10, s63, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s97, 8 ; GFX11-NEXT: s_or_b32 s5, s5, s9 ; GFX11-NEXT: s_or_b32 s9, s10, s11 -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 -; GFX11-NEXT: v_readlane_b32 s6, v37, 12 +; GFX11-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s7 +; GFX11-NEXT: v_readlane_b32 s6, v37, 10 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s8, s8, 16 ; GFX11-NEXT: s_and_b32 s5, s5, 0xffff @@ -11631,34 +11401,39 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_or_b32 s4, s4, s8 ; GFX11-NEXT: s_or_b32 s5, s5, s9 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 -; GFX11-NEXT: s_lshl_b32 s4, s56, 8 -; GFX11-NEXT: s_and_b32 s5, s57, 0xff +; GFX11-NEXT: v_dual_mov_b32 v7, s4 :: v_dual_mov_b32 v8, s5 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s31, 8 +; GFX11-NEXT: s_and_b32 s5, s95, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v37, 13 +; GFX11-NEXT: v_readlane_b32 s7, v37, 11 ; GFX11-NEXT: s_or_b32 s2, s2, s4 ; GFX11-NEXT: s_or_b32 s4, s5, s6 -; GFX11-NEXT: s_lshl_b32 s5, s34, 8 -; GFX11-NEXT: s_and_b32 s6, vcc_hi, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s46, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s93, 8 +; GFX11-NEXT: s_and_b32 s6, s91, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s89, 8 ; GFX11-NEXT: s_or_b32 s3, s3, s5 ; GFX11-NEXT: s_or_b32 s5, s6, s7 -; GFX11-NEXT: v_readlane_b32 s6, v37, 14 +; GFX11-NEXT: v_readlane_b32 s6, v37, 12 +; GFX11-NEXT: v_readlane_b32 s7, v37, 13 +; GFX11-NEXT: v_readlane_b32 s7, v37, 14 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s4 ; GFX11-NEXT: s_or_b32 s3, s3, s5 -; GFX11-NEXT: s_lshl_b32 s4, s47, 8 -; GFX11-NEXT: s_and_b32 s5, s104, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s79, 8 +; GFX11-NEXT: s_and_b32 s5, s77, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v37, 15 ; GFX11-NEXT: s_or_b32 s0, s0, s4 ; GFX11-NEXT: s_or_b32 s4, s5, s6 -; GFX11-NEXT: s_lshl_b32 s5, s103, 8 -; GFX11-NEXT: s_and_b32 s6, s102, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s101, 8 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s75, 8 +; GFX11-NEXT: s_and_b32 s6, s73, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s5 ; GFX11-NEXT: s_or_b32 s5, s6, s7 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff @@ -11667,17 +11442,21 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s4 ; GFX11-NEXT: s_or_b32 s1, s1, s5 -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 -; GFX11-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 -; GFX11-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v8, s1 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-NEXT: v_dual_mov_b32 v11, s0 :: v_dual_mov_b32 v12, s1 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:112 ; GFX11-NEXT: v_readlane_b32 s104, v35, 8 ; GFX11-NEXT: v_readlane_b32 s103, v35, 7 ; GFX11-NEXT: v_readlane_b32 s102, v35, 6 ; GFX11-NEXT: v_readlane_b32 s101, v35, 5 +; GFX11-NEXT: v_readlane_b32 s100, v35, 4 +; GFX11-NEXT: v_readlane_b32 s99, v35, 3 +; GFX11-NEXT: v_readlane_b32 s98, v35, 2 +; GFX11-NEXT: v_readlane_b32 s97, v35, 1 ; GFX11-NEXT: v_readlane_b32 s96, v35, 0 ; GFX11-NEXT: v_readlane_b32 s87, v34, 31 ; GFX11-NEXT: v_readlane_b32 s86, v34, 30 @@ -11689,6 +11468,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s80, v34, 24 ; GFX11-NEXT: v_readlane_b32 s71, v34, 23 ; GFX11-NEXT: v_readlane_b32 s70, v34, 22 +; GFX11-NEXT: v_readlane_b32 s69, v34, 21 ; GFX11-NEXT: v_readlane_b32 s68, v34, 20 ; GFX11-NEXT: v_readlane_b32 s67, v34, 19 ; GFX11-NEXT: v_readlane_b32 s66, v34, 18 @@ -11708,6 +11488,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s36, v34, 4 ; GFX11-NEXT: v_readlane_b32 s35, v34, 3 ; GFX11-NEXT: v_readlane_b32 s34, v34, 2 +; GFX11-NEXT: v_readlane_b32 s31, v34, 1 +; GFX11-NEXT: v_readlane_b32 s30, v34, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v34, off, s32 @@ -11717,6 +11499,137 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_4: +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 +; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2 +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr86 +; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr59 +; GFX11-NEXT: ; implicit-def: $sgpr58 +; GFX11-NEXT: ; implicit-def: $sgpr84 +; GFX11-NEXT: ; implicit-def: $sgpr83 +; GFX11-NEXT: ; implicit-def: $sgpr82 +; GFX11-NEXT: ; implicit-def: $sgpr94 +; GFX11-NEXT: ; implicit-def: $sgpr81 +; GFX11-NEXT: ; implicit-def: $sgpr80 +; GFX11-NEXT: ; implicit-def: $sgpr71 +; GFX11-NEXT: ; implicit-def: $sgpr70 +; GFX11-NEXT: ; implicit-def: $sgpr69 +; GFX11-NEXT: ; implicit-def: $sgpr92 +; GFX11-NEXT: ; implicit-def: $sgpr45 +; GFX11-NEXT: ; implicit-def: $sgpr44 +; GFX11-NEXT: ; implicit-def: $sgpr41 +; GFX11-NEXT: ; implicit-def: $sgpr40 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr90 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr68 +; GFX11-NEXT: ; implicit-def: $sgpr67 +; GFX11-NEXT: ; implicit-def: $sgpr66 +; GFX11-NEXT: ; implicit-def: $sgpr23 +; GFX11-NEXT: ; implicit-def: $sgpr88 +; GFX11-NEXT: ; implicit-def: $sgpr22 +; GFX11-NEXT: ; implicit-def: $sgpr65 +; GFX11-NEXT: ; implicit-def: $sgpr64 +; GFX11-NEXT: ; implicit-def: $sgpr55 +; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr78 +; GFX11-NEXT: ; implicit-def: $sgpr53 +; GFX11-NEXT: ; implicit-def: $sgpr52 +; GFX11-NEXT: ; implicit-def: $sgpr51 +; GFX11-NEXT: ; implicit-def: $sgpr50 +; GFX11-NEXT: ; implicit-def: $sgpr49 +; GFX11-NEXT: ; implicit-def: $sgpr76 +; GFX11-NEXT: ; implicit-def: $sgpr48 +; GFX11-NEXT: ; implicit-def: $sgpr39 +; GFX11-NEXT: ; implicit-def: $sgpr38 +; GFX11-NEXT: ; implicit-def: $sgpr37 +; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr74 +; GFX11-NEXT: ; implicit-def: $sgpr35 +; GFX11-NEXT: ; implicit-def: $sgpr34 +; GFX11-NEXT: ; implicit-def: $sgpr104 +; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr72 +; GFX11-NEXT: ; implicit-def: $sgpr102 +; GFX11-NEXT: ; implicit-def: $sgpr101 +; GFX11-NEXT: ; implicit-def: $sgpr100 +; GFX11-NEXT: ; implicit-def: $sgpr96 +; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: ; implicit-def: $sgpr87 +; GFX11-NEXT: ; implicit-def: $sgpr30 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr99 +; GFX11-NEXT: ; implicit-def: $sgpr63 +; GFX11-NEXT: ; implicit-def: $sgpr97 +; GFX11-NEXT: ; implicit-def: $sgpr31 +; GFX11-NEXT: ; implicit-def: $sgpr95 +; GFX11-NEXT: ; implicit-def: $sgpr93 +; GFX11-NEXT: ; implicit-def: $sgpr91 +; GFX11-NEXT: ; implicit-def: $sgpr89 +; GFX11-NEXT: ; implicit-def: $sgpr79 +; GFX11-NEXT: ; implicit-def: $sgpr77 +; GFX11-NEXT: ; implicit-def: $sgpr75 +; GFX11-NEXT: ; implicit-def: $sgpr73 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; kill: killed $sgpr61 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4 +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6 +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 +; GFX11-NEXT: s_branch .LBB13_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -11754,22 +11667,22 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 @@ -11788,33 +11701,33 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:152 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:184 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11871,19 +11784,19 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 @@ -11891,19 +11804,19 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -11914,7 +11827,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 @@ -11922,27 +11835,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 @@ -11951,15 +11864,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 @@ -11973,24 +11886,24 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 @@ -11999,29 +11912,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:268 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 @@ -12030,29 +11940,29 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 @@ -12061,29 +11971,29 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 @@ -12092,239 +12002,307 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: v_or_b32_e32 v3, v40, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v5, v41, v5 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v36 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v43 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v63 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v6, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v34 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v38 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v8, v8, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v8, v8, v55 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v46 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v51 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v63 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v51 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v45 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -12342,204 +12320,299 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v59 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 ; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v43 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 ; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -12551,351 +12624,189 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: .LBB14_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v45, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v5, v41, v5 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v4, v43, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_or_b32_e32 v5, v63, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v56, v6 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v46, v7 +; SI-NEXT: v_or_b32_e32 v7, v38, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v8, v44, v8 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 @@ -12903,15 +12814,14 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 @@ -12919,12 +12829,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 @@ -12932,12 +12842,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 @@ -12945,12 +12855,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -12958,12 +12868,14 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 @@ -12995,7 +12907,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13004,7 +12916,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -13012,7 +12924,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13021,7 +12933,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -13029,7 +12941,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13038,7 +12950,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -13046,7 +12958,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13055,7 +12967,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v19 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -13063,7 +12975,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13072,7 +12984,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -13080,7 +12992,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13089,7 +13001,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -13097,7 +13009,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13106,15 +13018,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13123,32 +13035,30 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13157,15 +13067,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13174,15 +13084,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13191,15 +13101,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13208,15 +13118,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13225,15 +13135,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -13242,19 +13152,21 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_or_b32_e32 v31, v43, v31 ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 @@ -13331,16 +13243,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v9 @@ -13376,43 +13288,42 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill @@ -13422,13 +13333,13 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 @@ -13438,11 +13349,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 @@ -13451,7 +13362,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -13461,7 +13372,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill @@ -13475,19 +13386,19 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill @@ -13501,25 +13412,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -13527,25 +13438,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -13553,25 +13464,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -13579,15 +13490,15 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 @@ -13596,88 +13507,88 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v14, v44, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: v_or_b32_sdwa v15, v41, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13701,84 +13612,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13792,41 +13635,41 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13844,11 +13687,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -13861,17 +13704,17 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -13880,289 +13723,359 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: .LBB14_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB14_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v31, 0x300 +; VI-NEXT: v_add_u16_e32 v9, 3, v62 ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 -; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v5, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -14194,11 +14107,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v62 +; VI-NEXT: v_add_u16_e32 v8, 3, v63 ; VI-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v32 +; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v10, 3, v60 ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -14207,9 +14120,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v10, v10, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v9, v9, v10 ; VI-NEXT: v_add_u16_e32 v10, 3, v58 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v11, 3, v56 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -14231,7 +14143,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 3, v46 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 ; VI-NEXT: v_add_u16_sdwa v13, v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v12, v12, v13 @@ -14239,39 +14151,38 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 3, v44 -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_add_u16_sdwa v14, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v42 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v14, 3, v43 ; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v15, 3, v40 +; VI-NEXT: v_add_u16_e32 v15, 3, v41 ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_sdwa v15, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v14, v14, v15 ; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v15, 3, v15 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 @@ -14279,12 +14190,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v16, v17 ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v18, 3, v18 @@ -14317,7 +14228,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 @@ -14336,14 +14247,14 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v21, 0x300, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v22, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 @@ -14370,46 +14281,46 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v24, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v23, v24 ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v26, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -14421,21 +14332,21 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v28, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v27, v27, v28 -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v28, 3, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v29, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v28, v28, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -14448,7 +14359,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v29, v30 ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v30, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -14460,7 +14371,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v32, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v30, v30, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v32, 3, v32 @@ -14470,7 +14381,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v33, 3, v33 -; VI-NEXT: v_or_b32_sdwa v33, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: .LBB14_4: ; %end @@ -14547,16 +14458,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v9 @@ -14602,47 +14513,45 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v32 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill @@ -14652,13 +14561,13 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 @@ -14669,11 +14578,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 @@ -14682,7 +14591,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill @@ -14694,7 +14603,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill @@ -14709,19 +14618,19 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill @@ -14736,25 +14645,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -14763,25 +14672,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -14790,25 +14699,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -14817,106 +14726,105 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_or_b32_sdwa v15, v41, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -14940,84 +14848,16 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -15031,41 +14871,41 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -15083,11 +14923,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -15100,17 +14940,17 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -15119,296 +14959,363 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: .LBB14_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v62 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(27) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -15440,11 +15347,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 ; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v32 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -15453,7 +15360,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 @@ -15476,7 +15383,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 @@ -15484,39 +15391,38 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v14, 3, v43 ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v41 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 @@ -15524,12 +15430,12 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 @@ -15562,7 +15468,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 @@ -15581,14 +15487,14 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 @@ -15615,46 +15521,46 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v25 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -15666,21 +15572,21 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -15693,7 +15599,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -15705,7 +15611,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 @@ -15715,7 +15621,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v32, v63, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 ; GFX9-NEXT: .LBB14_4: ; %end @@ -17532,254 +17438,244 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v25 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 -; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v44 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v43 -; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v42 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v54 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v0 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -17787,920 +17683,933 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB15_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v59, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v2, v2, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v61 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v47, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v0, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 -; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v43 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mov_b32_e32 v50, v16 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v16, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: v_mov_b32_e32 v48, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v17, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v18, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 -; SI-NEXT: v_mov_b32_e32 v54, v23 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v20, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v3 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v21, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v22, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v45, v24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v34, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 -; SI-NEXT: v_or_b32_e32 v23, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v61 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v24, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v25, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 -; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v26, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v37, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v52, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_or_b32_e32 v27, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_mov_b32_e32 v34, v41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v28, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v29, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v30, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v31, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s20, 0xff -; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s24, 0xff -; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: -; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v43 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v34, v41 +; SI-NEXT: v_mov_b32_e32 v33, v60 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mov_b32_e32 v35, v57 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) expcnt(3) +; SI-NEXT: v_mov_b32_e32 v59, v62 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB15_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_and_b32 s8, s26, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, s7, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v6, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v7, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v8, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v9, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v10, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v10 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v11, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v11 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v12, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v12 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v13, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v13 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v14, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v50, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v16, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v59, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v17, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v18, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v18 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v19, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v19 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v57, v4 +; SI-NEXT: v_or_b32_e32 v20, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v21, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v46, v1 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -18708,14 +18617,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -18723,20 +18632,19 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v0 @@ -18782,115 +18690,114 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v23 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 @@ -18901,26 +18808,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v11 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v7 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -18930,807 +18833,814 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:116 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB15_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff ; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s8, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v46, v0 -; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v62, v0 -; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v63, v1 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v1 +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_mov_b32_e32 v60, v0 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v35, v0 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v44, v0 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v34, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v59, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v63, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v51 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v61 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v55, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v37, v53 ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v54, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v41, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v33 +; VI-NEXT: v_mov_b32_e32 v38, v54 +; VI-NEXT: v_or_b32_sdwa v0, v54, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v63 +; VI-NEXT: v_mov_b32_e32 v46, v33 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v51, v34 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v61, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v56 -; VI-NEXT: v_or_b32_sdwa v0, v56, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v38, v39 -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v53 -; VI-NEXT: v_mov_b32_e32 v52, v36 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v0, v36, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v60 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v1, v33, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v34, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v62, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v59, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v50, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v49, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v51, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v56, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v48, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v39, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v50, v40 -; VI-NEXT: v_mov_b32_e32 v49, v51 -; VI-NEXT: v_mov_b32_e32 v40, v34 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v36, v62 +; VI-NEXT: v_mov_b32_e32 v59, v58 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v62, v32 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: -; VI-NEXT: v_mov_b32_e32 v44, v56 -; VI-NEXT: v_mov_b32_e32 v41, v33 -; VI-NEXT: v_mov_b32_e32 v50, v40 -; VI-NEXT: v_mov_b32_e32 v38, v39 -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v54, v53 -; VI-NEXT: v_mov_b32_e32 v52, v36 -; VI-NEXT: v_mov_b32_e32 v49, v51 +; VI-NEXT: v_mov_b32_e32 v48, v63 +; VI-NEXT: v_mov_b32_e32 v37, v53 +; VI-NEXT: v_mov_b32_e32 v35, v51 +; VI-NEXT: v_mov_b32_e32 v38, v54 +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v36, v62 +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v46, v33 +; VI-NEXT: v_mov_b32_e32 v50, v32 +; VI-NEXT: v_mov_b32_e32 v59, v58 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB15_3: ; %Flow -; VI-NEXT: v_mov_b32_e32 v51, v41 -; VI-NEXT: v_mov_b32_e32 v36, v44 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v54, v60 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v52, v59 +; VI-NEXT: v_mov_b32_e32 v58, v36 +; VI-NEXT: v_mov_b32_e32 v59, v38 ; VI-NEXT: s_cbranch_vccnz .LBB15_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_mov_b32_e32 v33, v35 +; VI-NEXT: v_mov_b32_e32 v35, v37 +; VI-NEXT: v_mov_b32_e32 v37, v48 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, s7, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v16 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v17 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v41, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v19, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v2 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v21, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v21 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v46 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v59 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v61 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v58 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v56 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v47 +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v45 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -19775,128 +19685,130 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:80 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:144 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:176 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v41 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v55 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v45 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v20 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v28 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v30 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 @@ -19906,7 +19818,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: v_lshlrev_b32_e32 v41, 8, v41 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 @@ -19918,16 +19829,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v7 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -19938,423 +19848,410 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v13 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v5 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 -; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:252 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s8, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v59, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_mov_b32_e32 v61, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_mov_b32_e32 v37, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v47, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v61, v52 +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v35, v62 -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v40, v30 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v36, v31 -; GFX9-NEXT: v_mov_b32_e32 v45, v62 -; GFX9-NEXT: v_mov_b32_e32 v46, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v53 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v46, v42 +; GFX9-NEXT: v_mov_b32_e32 v53, v51 +; GFX9-NEXT: v_mov_b32_e32 v50, v33 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 +; GFX9-NEXT: v_mov_b32_e32 v42, v36 +; GFX9-NEXT: v_mov_b32_e32 v51, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 -; GFX9-NEXT: s_and_b32 s4, s16, 0xff -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s18, 0xff -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: v_mov_b32_e32 v33, v43 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v62 -; GFX9-NEXT: v_mov_b32_e32 v36, v31 -; GFX9-NEXT: v_mov_b32_e32 v40, v30 +; GFX9-NEXT: v_mov_b32_e32 v61, v52 +; GFX9-NEXT: v_mov_b32_e32 v46, v42 +; GFX9-NEXT: v_mov_b32_e32 v53, v51 +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v50, v33 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB15_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v62, v35 -; GFX9-NEXT: v_mov_b32_e32 v35, v38 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -20398,160 +20295,163 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s8, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s29, 8 ; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 ; GFX9-NEXT: s_movk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v53 +; GFX9-NEXT: v_mov_b32_e32 v54, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v50 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 @@ -20559,40 +20459,41 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 @@ -20600,153 +20501,159 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 3, v53 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v46 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v62 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v58 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v57 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v56 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -20992,309 +20899,241 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v91 -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v49 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 -; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v78 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v79 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v35 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v88 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v63 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v72 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v62 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v73 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v74 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v75 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v47 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v45 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v58 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v59 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v182 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v56 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v40 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v41 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v165 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v43 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v44 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v163 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v167 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v16, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v148 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v177 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v146 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v18, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v147 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v118 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v150 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v103 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v151 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v160 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v20, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v101 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v132 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v112 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v132 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v133 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v84 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v102 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v144 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v119 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v128 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v25, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v130 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v24, v131 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v84 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v130 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v131 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v114 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v116 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v27, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v94, 0xff, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v81 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v116 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v69 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v2, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v4, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v94, v89 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v99 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v30, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v2, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 @@ -21933,309 +21772,241 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91 -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v49 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 -; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v6, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v35 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v8, v63 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v33 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v62 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v10, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v47 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v45 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v12, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v182 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v13, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v180 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v14, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v165 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v15, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v163 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v16, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v148 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v177 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v146 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v18, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v118 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v103 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v20, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v101 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v21, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v86 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v22, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v23, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v81 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v24, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v25, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v24, v131 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v94, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v2, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v4, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v94, v89 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v99 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v30, v31 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v2, v3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3 @@ -24888,18 +24659,18 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -27451,650 +27222,655 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: v_mov_b32_e32 v43, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v58, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v28, v27 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v41, v23 -; SI-NEXT: v_mov_b32_e32 v29, v20 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v44, v20 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v59, v29 +; SI-NEXT: v_mov_b32_e32 v60, v24 +; SI-NEXT: v_mov_b32_e32 v41, v22 +; SI-NEXT: v_mov_b32_e32 v22, v13 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s28 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v37 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v57 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_mov_b32_e32 v47, v3 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v23 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v1, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 -; SI-NEXT: v_mov_b32_e32 v2, v14 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v3, v16 -; SI-NEXT: v_mov_b32_e32 v20, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v63 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v4, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v5, v29 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[4:5], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v5, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 -; SI-NEXT: v_mov_b32_e32 v6, v45 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[5:6], v[9:10], 16 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v57 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[6:7], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v7, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 -; SI-NEXT: v_mov_b32_e32 v8, v9 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[7:8], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v8, v24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_lshr_b64 v[8:9], v[24:25], 16 ; SI-NEXT: v_mov_b32_e32 v9, v54 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_mov_b32_e32 v10, v11 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: v_mov_b32_e32 v10, v53 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 -; SI-NEXT: v_mov_b32_e32 v13, v58 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[53:54], 16 +; SI-NEXT: v_mov_b32_e32 v11, v52 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 -; SI-NEXT: v_mov_b32_e32 v14, v60 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[11:12], v[52:53], 16 +; SI-NEXT: v_mov_b32_e32 v12, v51 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 -; SI-NEXT: v_mov_b32_e32 v15, v62 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[12:13], v[51:52], 16 +; SI-NEXT: v_mov_b32_e32 v13, v39 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 -; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v14, v32 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_mov_b32_e32 v40, v17 -; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_mov_b32_e32 v21, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 -; SI-NEXT: v_mov_b32_e32 v22, v31 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 -; SI-NEXT: v_mov_b32_e32 v23, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 -; SI-NEXT: v_mov_b32_e32 v24, v41 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[14:15], v[32:33], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 -; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_lshr_b64 v[16:17], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v17, v62 +; SI-NEXT: v_mov_b32_e32 v19, v63 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v18, v44 +; SI-NEXT: v_mov_b32_e32 v63, v19 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[18:19], v[44:45], 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_mov_b32_e32 v19, v61 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[61:62], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v56 +; SI-NEXT: v_mov_b32_e32 v56, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; SI-NEXT: v_mov_b32_e32 v20, v60 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[60:61], 16 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v42 +; SI-NEXT: v_mov_b32_e32 v42, v59 +; SI-NEXT: v_lshr_b64 v[21:22], v[59:60], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v47 +; SI-NEXT: v_lshr_b64 v[22:23], v[58:59], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v58, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v43 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_mov_b32_e32 v23, v31 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v32, v35 ; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 -; SI-NEXT: v_mov_b32_e32 v26, v43 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_lshr_b64 v[26:27], v[34:35], 16 +; SI-NEXT: v_mov_b32_e32 v27, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 ; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v57 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshr_b64 v[28:29], v[50:51], 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_mov_b32_e32 v29, v30 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 -; SI-NEXT: v_mov_b32_e32 v53, v31 -; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[30:31], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v49 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshr_b64 v[34:35], v[49:50], 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[38:39], 16 +; SI-NEXT: v_mov_b32_e32 v31, v34 ; SI-NEXT: s_branch .LBB19_3 ; SI-NEXT: .LBB19_2: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: v_mov_b32_e32 v42, v59 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v50 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v60, v3 +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB19_3: ; %Flow -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mov_b32_e32 v32, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v33, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v53 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v47 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v54, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v36, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v61 ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 @@ -28103,83 +27879,72 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 ; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB19_5: ; %end @@ -28209,36 +27974,36 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 ; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 ; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 ; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill @@ -29588,8 +29353,8 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, v7 :: v_dual_mov_b32 v182, v6 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v183, v5 :: v_dual_mov_b32 v168, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v169, v3 :: v_dual_mov_b32 v170, v2 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, v0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, s28 :: v_dual_mov_b32 v173, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v173, v0 :: v_dual_mov_b32 v174, s29 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 @@ -29612,769 +29377,655 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s25, 16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v7, v2 :: v_dual_add_nc_u32 v7, v8, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v9, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v1.l -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v1.l +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 ; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 ; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 -; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v167 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v177 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v167 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v167 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v167 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v176 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v176 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v176 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v177 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v178 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v179 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v6 :: v_dual_add_f32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v177 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v177 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v178 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v178 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v181 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v4, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v179 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v179 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v180 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v180 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v183 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v3, v5, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v182 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v3, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v181 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v181 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v182 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v182 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v169 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v5, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v168 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v1.l ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v183 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v183 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v168 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v168 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v168 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v171 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v3, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v170 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v6 :: v_dual_add_f32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v168.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v169 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v169 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v170 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v170 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v171 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v171 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v172 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v170 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v169 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v172 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v6 :: v_dual_add_nc_u32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v5, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v171 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v173 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v173 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v173 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v4, v6 :: v_dual_add_nc_u32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v6 :: v_dual_lshlrev_b32 v6, 16, v173 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v174 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_add_f32 v5, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v174 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v9, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v172 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v172 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_add_f32 v4, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v173, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v173.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v174 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v174 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v7 :: v_dual_add_nc_u32 v7, v8, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v1.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v3.l ; GFX11-TRUE16-NEXT: .LBB19_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v125 :: v_dual_mov_b32 v5, v120 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v114 :: v_dual_mov_b32 v7, v107 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v99 :: v_dual_mov_b32 v9, v90 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v57 :: v_dual_mov_b32 v13, v44 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v17, v173 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v174 :: v_dual_mov_b32 v19, v171 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v19, v171 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v172 :: v_dual_mov_b32 v17, v174 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v23, v183 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 ; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 @@ -30483,101 +30134,174 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:184 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:56 +; GFX11-FAKE16-NEXT: s_clause 0xd ; 56-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v185, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v188, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v189, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v190, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v191, s32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v190, v13 :: v_dual_mov_b32 v191, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v67, v9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v179, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, v7 :: v_dual_mov_b32 v183, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v189, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v4 :: v_dual_mov_b32 v185, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v1 :: v_dual_mov_b32 v69, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v0 :: v_dual_mov_b32 v181, s29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v107, s16 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v34, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v140, s2 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:1080 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:1096 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:1112 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1128 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1144 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1160 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1176 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1192 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v144, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v114, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s17 :: v_dual_mov_b32 v159, s26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v42, s19 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:952 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:968 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:984 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1000 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1016 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1032 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1048 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1064 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:824 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:840 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:856 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:872 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:888 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:904 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:920 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:936 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s23 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:696 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:712 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:728 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:744 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:760 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:776 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:792 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:808 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, s24 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:632 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:648 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:664 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:680 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s25 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:552 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s27 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:424 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-FAKE16-NEXT: .LBB19_2: ; %cmp.true @@ -30585,762 +30309,937 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s24, 16 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v9 :: v_dual_add_nc_u32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:424 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s23, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v159, v0, 16, v1 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s22, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:552 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s21, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:632 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:648 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:664 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:680 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s20, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:696 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:712 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:728 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:744 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:760 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:776 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:792 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:808 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s19, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:824 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:840 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:856 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:872 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:888 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:904 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:920 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:936 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s18, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:952 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:968 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:984 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1000 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1016 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1032 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1048 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1064 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:1080 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:1096 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:1112 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1128 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1144 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1160 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1176 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1192 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v69 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v42, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v114, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 ; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v107, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 ; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v144, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v140, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v4, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v190 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v190 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v1, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_add_nc_u32 v3, v5, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v191 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v7, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v191 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v190, v1, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v3, v5, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v68 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v191, v2, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v7, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v67 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v5, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v179 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v7, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v179 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v68, v1, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v9, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v8 :: v_dual_and_b32 v6, 0xffff0000, v70 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v2, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v188 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v188 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v6, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v7, v11, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_add_f32 v5, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v183 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v3, v3, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v70, v1, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v180 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v189 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v188, v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v7, 16, v180 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_add_nc_u32 v6, v7, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v189 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_add_nc_u32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v12, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v185 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_add_nc_u32 v5, v11, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v185 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v10 :: v_dual_cndmask_b32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v10, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v184 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v189, v5, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v182 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v6, v11 :: v_dual_lshlrev_b32 v10, 16, v69 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v181 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v13 :: v_dual_add_nc_u32 v11, v12, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v14, v12 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v16 :: v_dual_add_nc_u32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v13, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v185, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v3, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v5, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v14, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v7, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v69, v9, 16, v10 ; GFX11-FAKE16-NEXT: .LBB19_3: ; %end -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v34 :: v_dual_mov_b32 v2, v140 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v42 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:1080 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:1096 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:1112 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:1128 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:1144 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:1160 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:1176 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:1192 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v144 :: v_dual_mov_b32 v4, v107 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v76 :: v_dual_mov_b32 v6, v114 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v184 :: v_dual_mov_b32 v20, v185 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v180 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v183 :: v_dual_mov_b32 v24, v188 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, v32 :: v_dual_mov_b32 v30, v191 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v190 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, v181 :: v_dual_mov_b32 v18, v182 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v70 :: v_dual_mov_b32 v26, v179 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v67 :: v_dual_mov_b32 v28, v68 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v14, v159 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v69 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v41 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:952 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:968 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:984 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:1000 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:1016 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:1032 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:1048 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:1064 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v42 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:824 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:840 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:856 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:872 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:888 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:904 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:920 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:936 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v43 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:696 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:712 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:728 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:744 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:760 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:776 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:792 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:808 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, v44 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:632 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:648 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:664 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:680 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v45 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:552 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v46 +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 224-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v191, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v188, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v185, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:92 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:220 +; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:308 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, v48 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB19_4: -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 ; GFX11-FAKE16-NEXT: s_branch .LBB19_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -31381,15 +31280,15 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 @@ -31402,213 +31301,199 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v4 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -31620,28 +31505,27 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v62 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -31658,89 +31542,99 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: .LBB20_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v62 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v46 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v62 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 ; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v19 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 @@ -31748,37 +31642,42 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 @@ -31801,10 +31700,11 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 @@ -31813,37 +31713,37 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 -; SI-NEXT: v_mov_b32_e32 v50, v28 ; SI-NEXT: v_mov_b32_e32 v48, v29 ; SI-NEXT: v_mov_b32_e32 v38, v30 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v56, v8 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v31 +; SI-NEXT: v_mov_b32_e32 v46, v28 +; SI-NEXT: v_mov_b32_e32 v63, v8 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 @@ -31854,27 +31754,25 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 @@ -31883,7 +31781,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 @@ -31892,7 +31790,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -31901,7 +31799,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -31910,7 +31808,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -31919,7 +31817,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -31928,7 +31826,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -31937,7 +31835,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -31946,7 +31844,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -31955,7 +31853,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -31964,7 +31862,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -31974,7 +31872,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -31985,8 +31883,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -31996,8 +31894,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -32007,8 +31905,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -32018,8 +31916,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -32029,8 +31927,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -32040,8 +31938,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -32051,8 +31949,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -32062,7 +31960,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -32073,8 +31971,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -32083,53 +31981,55 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -32459,9 +32359,9 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: s_lshr_b32 s4, s44, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 ; SI-NEXT: s_lshr_b32 s4, s43, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 ; SI-NEXT: s_lshr_b32 s4, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 @@ -32492,7 +32392,7 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v42, s47 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s46 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s44 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s43 ; SI-NEXT: v_cvt_f32_f16_e32 v62, s42 ; SI-NEXT: s_waitcnt expcnt(0) @@ -32595,7 +32495,7 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v42, s47 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s46 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s44 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s43 ; SI-NEXT: v_cvt_f32_f16_e32 v62, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s41 @@ -32628,8 +32528,8 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v43, s60 ; SI-NEXT: v_cvt_f32_f16_e32 v45, s59 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s56 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 ; SI-NEXT: .LBB21_3: ; %end @@ -32651,22 +32551,22 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v57, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v56, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v56, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 ; SI-NEXT: v_add_i32_e32 v47, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -32893,10 +32793,10 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 @@ -33635,218 +33535,210 @@ define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB22_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -33854,27 +33746,37 @@ define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload @@ -34227,22 +34129,23 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v28 ; SI-NEXT: v_mov_b32_e32 v53, v26 -; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v41, v6 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -34254,40 +34157,40 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 ; SI-NEXT: v_mov_b32_e32 v54, v14 ; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_mov_b32_e32 v41, v11 -; SI-NEXT: v_mov_b32_e32 v40, v10 -; SI-NEXT: v_mov_b32_e32 v44, v9 -; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v44, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 @@ -34298,25 +34201,25 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v42 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -34328,68 +34231,77 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB23_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_or_b32_e32 v19, v54, v19 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_mov_b32_e32 v34, v53 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_mov_b32_e32 v53, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: v_mov_b32_e32 v52, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v51, v22 ; SI-NEXT: v_mov_b32_e32 v51, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 @@ -34411,82 +34323,76 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v38, v27 ; SI-NEXT: v_mov_b32_e32 v38, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v28, v37, v28 ; SI-NEXT: v_mov_b32_e32 v37, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 ; SI-NEXT: v_or_b32_e32 v9, v14, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_or_b32_e32 v19, v54, v19 -; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 -; SI-NEXT: v_or_b32_e32 v2, v60, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_or_b32_e32 v12, v61, v12 ; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v13, v59, v13 ; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_or_b32_e32 v13, v57, v13 ; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_or_b32_e32 v14, v47, v14 ; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v45, v15 ; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v15, v43, v15 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v40, v17 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_or_b32_e32 v18, v55, v18 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_mov_b32_e32 v42, v55 +; SI-NEXT: v_or_b32_e32 v17, v55, v17 +; SI-NEXT: v_mov_b32_e32 v33, v40 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 ; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: v_or_b32_e32 v31, v63, v31 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB23_3 ; SI-NEXT: .LBB23_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v62, v61 ; SI-NEXT: v_mov_b32_e32 v60, v59 ; SI-NEXT: v_mov_b32_e32 v58, v57 ; SI-NEXT: v_mov_b32_e32 v56, v47 ; SI-NEXT: v_mov_b32_e32 v46, v45 ; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v42, v55 +; SI-NEXT: v_mov_b32_e32 v33, v40 +; SI-NEXT: v_mov_b32_e32 v36, v54 ; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v34, v53 +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: v_mov_b32_e32 v53, v21 +; SI-NEXT: v_mov_b32_e32 v52, v22 ; SI-NEXT: v_mov_b32_e32 v51, v23 ; SI-NEXT: v_mov_b32_e32 v50, v24 ; SI-NEXT: v_mov_b32_e32 v49, v25 @@ -34497,298 +34403,290 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB23_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v35, v40 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v40, v46 -; SI-NEXT: v_mov_b32_e32 v41, v56 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v43, v60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v63 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v43, v58 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB23_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v47 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_mov_b32_e32 v55, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v55 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v34 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v58 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v35 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -34801,7 +34699,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 @@ -34818,7 +34716,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 ; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -34826,7 +34724,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 @@ -34856,7 +34754,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v32i32_scalar: @@ -35087,252 +34985,214 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v160, v13 :: v_dual_mov_b32 v161, v12 +; GFX11-NEXT: v_dual_mov_b32 v162, v11 :: v_dual_mov_b32 v163, v10 +; GFX11-NEXT: v_dual_mov_b32 v164, v9 :: v_dual_mov_b32 v165, v8 +; GFX11-NEXT: v_dual_mov_b32 v166, v7 :: v_dual_mov_b32 v167, v6 +; GFX11-NEXT: v_dual_mov_b32 v176, v5 :: v_dual_mov_b32 v177, v4 +; GFX11-NEXT: v_dual_mov_b32 v178, v3 :: v_dual_mov_b32 v179, v2 +; GFX11-NEXT: v_dual_mov_b32 v180, v1 :: v_dual_mov_b32 v181, v0 +; GFX11-NEXT: v_dual_mov_b32 v182, s28 :: v_dual_mov_b32 v183, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:96 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 ; GFX11-NEXT: s_cbranch_scc0 .LBB23_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v27, s18 +; GFX11-NEXT: v_dual_mov_b32 v20, s17 :: v_dual_mov_b32 v35, s19 +; GFX11-NEXT: v_dual_mov_b32 v44, s20 :: v_dual_mov_b32 v65, s22 +; GFX11-NEXT: v_dual_mov_b32 v54, s21 :: v_dual_mov_b32 v77, s23 +; GFX11-NEXT: v_dual_mov_b32 v90, s24 :: v_dual_mov_b32 v119, s26 +; GFX11-NEXT: v_dual_mov_b32 v104, s25 :: v_dual_mov_b32 v135, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB23_3 ; GFX11-NEXT: .LBB23_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v135, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v119, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v104, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v90, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v77, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v65, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v54, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v44, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v35, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v160, 0x200, v160 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v161, 0x200, v161 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v162, 0x200, v162 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v163, 0x200, v163 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v164, 0x200, v164 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v165, 0x200, v165 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v166, 0x200, v166 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v167, 0x200, v167 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB23_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 +; GFX11-NEXT: v_mov_b32_e32 v13, v104 ; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:220 +; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14 +; GFX11-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 +; GFX11-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65 +; GFX11-NEXT: v_mov_b32_e32 v14, v119 +; GFX11-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v182 +; GFX11-NEXT: v_dual_mov_b32 v17, v183 :: v_dual_mov_b32 v18, v181 +; GFX11-NEXT: v_dual_mov_b32 v19, v180 :: v_dual_mov_b32 v20, v179 +; GFX11-NEXT: v_dual_mov_b32 v21, v178 :: v_dual_mov_b32 v22, v177 +; GFX11-NEXT: v_dual_mov_b32 v23, v176 :: v_dual_mov_b32 v24, v167 +; GFX11-NEXT: v_dual_mov_b32 v25, v166 :: v_dual_mov_b32 v26, v165 +; GFX11-NEXT: v_dual_mov_b32 v27, v164 :: v_dual_mov_b32 v28, v163 +; GFX11-NEXT: v_dual_mov_b32 v29, v162 :: v_dual_mov_b32 v30, v161 +; GFX11-NEXT: v_mov_b32_e32 v31, v160 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB23_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 ; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109 +; GFX11-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122 +; GFX11-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136 +; GFX11-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151 ; GFX11-NEXT: s_branch .LBB23_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -36065,6 +35925,12 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[76:77], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 16 ; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: s_lshr_b32 s38, s5, 16 ; SI-NEXT: s_lshr_b32 s39, s7, 16 ; SI-NEXT: s_lshr_b32 s48, s9, 16 @@ -36081,50 +35947,47 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s67, s45, 16 ; SI-NEXT: s_lshr_b32 s68, s47, 16 ; SI-NEXT: s_lshr_b32 s69, s57, 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: .LBB25_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s36, 16 -; SI-NEXT: s_and_b32 s29, s56, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s56, 0xffff +; SI-NEXT: s_lshl_b32 s29, s36, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v1, s27 ; SI-NEXT: s_and_b32 s27, s57, 0xffff ; SI-NEXT: s_lshl_b32 s29, s69, 16 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s34, 16 -; SI-NEXT: s_and_b32 s29, s46, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s47, 0xffff -; SI-NEXT: s_lshl_b32 s29, s68, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s30, 16 -; SI-NEXT: s_and_b32 s29, s44, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s46, 0xffff +; SI-NEXT: s_lshl_b32 s29, s34, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v5, s27 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s47, 0xffff +; SI-NEXT: s_lshl_b32 s29, s68, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_and_b32 s27, s45, 0xffff -; SI-NEXT: s_lshl_b32 s29, s67, 16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s44, 0xffff +; SI-NEXT: s_lshl_b32 s29, s30, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s45, 0xffff +; SI-NEXT: s_lshl_b32 s29, s67, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s27 ; SI-NEXT: s_and_b32 s27, s42, 0xffff ; SI-NEXT: s_lshl_b32 s29, s94, 16 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -36360,12 +36223,12 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: s_branch .LBB25_2 ; ; VI-LABEL: bitcast_v32i32_to_v64i16_scalar: @@ -36677,10 +36540,10 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -36900,8 +36763,8 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v11, v11, v44 ; SI-NEXT: v_or_b32_e32 v12, v12, v43 ; SI-NEXT: v_or_b32_e32 v13, v13, v42 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v55 +; SI-NEXT: v_or_b32_e32 v14, v14, v55 +; SI-NEXT: v_or_b32_e32 v15, v15, v40 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -36922,8 +36785,8 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 @@ -37101,8 +36964,8 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v11, v44, v11 ; SI-NEXT: v_or_b32_e32 v12, v43, v12 ; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v55, v15 +; SI-NEXT: v_or_b32_e32 v14, v55, v14 +; SI-NEXT: v_or_b32_e32 v15, v40, v15 ; SI-NEXT: v_or_b32_e32 v19, v39, v19 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -37433,222 +37296,207 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v12 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v38, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v46, v10 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v55 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB27_2 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v7, v0, v48 +; SI-NEXT: v_mov_b32_e32 v60, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v8, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v39 +; SI-NEXT: v_or_b32_e32 v9, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v11, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v12, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v13, v0, v44 +; SI-NEXT: v_or_b32_e32 v10, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v13, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v43 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v16, v0, v42 +; SI-NEXT: v_or_b32_e32 v16, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v18, v0, v41 +; SI-NEXT: v_or_b32_e32 v18, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v19, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v20, v0, v37 +; SI-NEXT: v_or_b32_e32 v20, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v22, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: v_or_b32_e32 v24, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 ; SI-NEXT: v_or_b32_e32 v27, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v28, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v29, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_or_b32_e32 v8, v1, v56 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: v_or_b32_e32 v31, v0, v31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB27_3 -; SI-NEXT: .LBB27_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB27_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v58, v49 -; SI-NEXT: s_cbranch_vccnz .LBB27_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_mov_b32_e32 v54, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v48 +; SI-NEXT: v_mov_b32_e32 v61, v39 +; SI-NEXT: v_mov_b32_e32 v59, v37 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v53, v35 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v63, v56 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -37693,139 +37541,134 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -37834,7 +37677,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB27_5: ; %end +; SI-NEXT: .LBB27_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -37851,8 +37694,26 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB27_4: +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v48 +; SI-NEXT: v_mov_b32_e32 v61, v39 +; SI-NEXT: v_mov_b32_e32 v60, v0 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v59, v37 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v54, v4 +; SI-NEXT: v_mov_b32_e32 v53, v35 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v63, v56 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB27_2 ; ; VI-LABEL: bitcast_v64i16_to_v32i32_scalar: ; VI: ; %bb.0: @@ -38189,252 +38050,214 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v160, v13 :: v_dual_mov_b32 v161, v12 +; GFX11-NEXT: v_dual_mov_b32 v162, v11 :: v_dual_mov_b32 v163, v10 +; GFX11-NEXT: v_dual_mov_b32 v164, v9 :: v_dual_mov_b32 v165, v8 +; GFX11-NEXT: v_dual_mov_b32 v166, v7 :: v_dual_mov_b32 v167, v6 +; GFX11-NEXT: v_dual_mov_b32 v176, v5 :: v_dual_mov_b32 v177, v4 +; GFX11-NEXT: v_dual_mov_b32 v178, v3 :: v_dual_mov_b32 v179, v2 +; GFX11-NEXT: v_dual_mov_b32 v180, v1 :: v_dual_mov_b32 v181, v0 +; GFX11-NEXT: v_dual_mov_b32 v182, s28 :: v_dual_mov_b32 v183, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:96 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 ; GFX11-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v27, s18 +; GFX11-NEXT: v_dual_mov_b32 v20, s17 :: v_dual_mov_b32 v35, s19 +; GFX11-NEXT: v_dual_mov_b32 v44, s20 :: v_dual_mov_b32 v65, s22 +; GFX11-NEXT: v_dual_mov_b32 v54, s21 :: v_dual_mov_b32 v77, s23 +; GFX11-NEXT: v_dual_mov_b32 v90, s24 :: v_dual_mov_b32 v119, s26 +; GFX11-NEXT: v_dual_mov_b32 v104, s25 :: v_dual_mov_b32 v135, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB27_3 ; GFX11-NEXT: .LBB27_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v135, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v119, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v104, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v90, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v77, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v65, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v54, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v44, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v35, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v160, v160, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v161, v161, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v162, v162, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v163, v163, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v164, v164, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v165, v165, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v166, v166, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v167, v167, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB27_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 +; GFX11-NEXT: v_mov_b32_e32 v13, v104 ; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:220 +; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14 +; GFX11-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 +; GFX11-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65 +; GFX11-NEXT: v_mov_b32_e32 v14, v119 +; GFX11-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v182 +; GFX11-NEXT: v_dual_mov_b32 v17, v183 :: v_dual_mov_b32 v18, v181 +; GFX11-NEXT: v_dual_mov_b32 v19, v180 :: v_dual_mov_b32 v20, v179 +; GFX11-NEXT: v_dual_mov_b32 v21, v178 :: v_dual_mov_b32 v22, v177 +; GFX11-NEXT: v_dual_mov_b32 v23, v176 :: v_dual_mov_b32 v24, v167 +; GFX11-NEXT: v_dual_mov_b32 v25, v166 :: v_dual_mov_b32 v26, v165 +; GFX11-NEXT: v_dual_mov_b32 v27, v164 :: v_dual_mov_b32 v28, v163 +; GFX11-NEXT: v_dual_mov_b32 v29, v162 :: v_dual_mov_b32 v30, v161 +; GFX11-NEXT: v_mov_b32_e32 v31, v160 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB27_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 ; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109 +; GFX11-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122 +; GFX11-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136 +; GFX11-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151 ; GFX11-NEXT: s_branch .LBB27_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -40202,6 +40025,7 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 @@ -40212,7 +40036,6 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 @@ -40266,6 +40089,7 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 @@ -40276,7 +40100,6 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB35_4 @@ -40330,6 +40153,7 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 @@ -40340,7 +40164,6 @@ define inreg <32 x float> @bitcast_v16f64_to_v32f32_scalar(<16 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 @@ -41653,107 +41476,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -41770,258 +41492,362 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; kill: killed $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB36_2 ; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v8 ; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v7 ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v5 ; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v2 ; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v1 ; VI-NEXT: .LBB36_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB36_4 @@ -42084,273 +41910,272 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v32 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v8 ; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v7 ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v5 ; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v2 ; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v1 ; VI-NEXT: .LBB36_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v48 -; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v57 -; VI-NEXT: v_or_b32_sdwa v2, v2, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v41 -; VI-NEXT: v_or_b32_sdwa v48, v53, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v41 +; VI-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v38, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v47 +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 -; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -42361,23 +42186,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -42388,23 +42212,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -42415,23 +42238,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -42442,10 +42264,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -42455,9 +42277,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -42468,10 +42290,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -42482,9 +42304,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -42495,10 +42317,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -42509,9 +42331,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -42522,21 +42344,23 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -42547,10 +42371,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -42561,9 +42385,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -42574,10 +42398,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -42588,13 +42412,15 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -42639,9 +42465,69 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -42652,6 +42538,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -42662,6 +42552,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -42672,6 +42566,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -42682,6 +42580,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -42692,6 +42594,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -42702,6 +42608,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -42712,6 +42622,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 @@ -42722,316 +42636,230 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(45) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB36_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(47) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(48) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v1 ; GFX9-NEXT: .LBB36_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB36_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GFX9-NEXT: s_waitcnt vmcnt(44) +; GFX9-NEXT: s_waitcnt vmcnt(46) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 @@ -43112,342 +42940,334 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 -; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v1 ; GFX9-NEXT: .LBB36_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v52 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v36 -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v50 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v57 -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v42 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v38 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v47 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v46 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v55 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v41 -; GFX9-NEXT: v_or_b32_sdwa v33, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v61, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43457,11 +43277,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -43470,10 +43290,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43483,11 +43303,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -43496,10 +43316,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43509,11 +43329,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -43522,10 +43342,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43535,11 +43355,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -43548,10 +43368,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43561,11 +43381,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -43574,10 +43394,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -43587,11 +43407,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -43600,17 +43420,15 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -43871,17 +43689,17 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l @@ -44075,28 +43893,27 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 @@ -44182,6 +43999,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 @@ -44190,7 +44008,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 @@ -44199,18 +44016,19 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 @@ -44292,10 +44110,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: .LBB36_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB36_4 @@ -44412,56 +44229,52 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: .LBB36_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v66 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v67, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v61 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v67, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v58 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v57 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v47 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v46 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v45 @@ -44470,22 +44283,26 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v64 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v65, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v44 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v64 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v42 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v41 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v40 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 @@ -44715,27 +44532,26 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:84 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -44866,120 +44682,109 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB37_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s4, s7, 24 +; SI-NEXT: s_lshr_b32 s4, s25, 24 ; SI-NEXT: v_writelane_b32 v62, s4, 17 -; SI-NEXT: s_lshr_b32 s4, s9, 24 +; SI-NEXT: s_lshr_b32 s4, s25, 16 ; SI-NEXT: v_writelane_b32 v62, s4, 16 -; SI-NEXT: s_lshr_b32 s4, s11, 24 +; SI-NEXT: s_lshr_b32 s4, s41, 24 ; SI-NEXT: v_writelane_b32 v62, s4, 15 -; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: v_writelane_b32 v62, s4, 14 -; SI-NEXT: s_lshr_b32 s4, s13, 24 +; SI-NEXT: s_lshr_b32 s4, s41, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 13 -; SI-NEXT: s_lshr_b32 s4, s13, 16 +; SI-NEXT: s_lshr_b32 s4, s43, 24 ; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: s_lshr_b32 s4, s13, 8 +; SI-NEXT: s_lshr_b32 s4, s43, 16 ; SI-NEXT: v_writelane_b32 v62, s4, 11 -; SI-NEXT: s_lshr_b32 s4, s15, 24 +; SI-NEXT: s_lshr_b32 s4, s43, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 10 -; SI-NEXT: s_lshr_b32 s4, s15, 16 +; SI-NEXT: s_lshr_b32 s4, s45, 24 ; SI-NEXT: v_writelane_b32 v62, s4, 9 -; SI-NEXT: s_lshr_b32 s4, s15, 8 +; SI-NEXT: s_lshr_b32 s4, s45, 16 ; SI-NEXT: v_writelane_b32 v62, s4, 8 -; SI-NEXT: s_lshr_b32 s4, s17, 24 +; SI-NEXT: s_lshr_b32 s4, s45, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 7 -; SI-NEXT: s_lshr_b32 s4, s17, 16 +; SI-NEXT: s_lshr_b32 s4, s47, 24 ; SI-NEXT: v_writelane_b32 v62, s4, 6 -; SI-NEXT: s_lshr_b32 s4, s17, 8 +; SI-NEXT: s_lshr_b32 s4, s47, 16 ; SI-NEXT: v_writelane_b32 v62, s4, 5 -; SI-NEXT: s_lshr_b32 s4, s19, 24 +; SI-NEXT: s_lshr_b32 s4, s47, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 4 -; SI-NEXT: s_lshr_b32 s4, s19, 16 +; SI-NEXT: s_lshr_b32 s4, s57, 24 ; SI-NEXT: v_writelane_b32 v62, s4, 3 -; SI-NEXT: s_lshr_b32 s4, s19, 8 +; SI-NEXT: s_lshr_b32 s4, s57, 16 ; SI-NEXT: v_writelane_b32 v62, s4, 2 -; SI-NEXT: s_lshr_b32 s4, s21, 24 +; SI-NEXT: s_lshr_b32 s4, s57, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 1 -; SI-NEXT: s_lshr_b32 s4, s21, 16 +; SI-NEXT: s_lshr_b32 s4, s59, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 0 -; SI-NEXT: s_lshr_b32 s4, s21, 8 -; SI-NEXT: v_writelane_b32 v61, s4, 63 -; SI-NEXT: s_lshr_b32 s4, s23, 24 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 24 ; SI-NEXT: v_writelane_b32 v61, s4, 62 -; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: v_writelane_b32 v61, s4, 61 -; SI-NEXT: s_lshr_b32 s4, s23, 8 +; SI-NEXT: v_writelane_b32 v61, s5, 63 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 ; SI-NEXT: v_writelane_b32 v61, s4, 60 -; SI-NEXT: s_lshr_b32 s4, s25, 24 -; SI-NEXT: v_writelane_b32 v61, s4, 59 -; SI-NEXT: s_lshr_b32 s4, s25, 16 +; SI-NEXT: v_writelane_b32 v61, s5, 61 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 8 ; SI-NEXT: v_writelane_b32 v61, s4, 58 -; SI-NEXT: s_lshr_b32 s4, s25, 8 -; SI-NEXT: v_writelane_b32 v61, s4, 57 -; SI-NEXT: s_lshr_b32 s4, s41, 24 +; SI-NEXT: v_writelane_b32 v61, s5, 59 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 ; SI-NEXT: v_writelane_b32 v61, s4, 56 -; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_writelane_b32 v61, s4, 55 -; SI-NEXT: s_lshr_b32 s4, s41, 8 +; SI-NEXT: v_writelane_b32 v61, s5, 57 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 16 ; SI-NEXT: v_writelane_b32 v61, s4, 54 -; SI-NEXT: s_lshr_b32 s4, s43, 24 -; SI-NEXT: v_writelane_b32 v61, s4, 53 -; SI-NEXT: s_lshr_b32 s4, s43, 16 +; SI-NEXT: v_writelane_b32 v61, s5, 55 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 8 ; SI-NEXT: v_writelane_b32 v61, s4, 52 -; SI-NEXT: s_lshr_b32 s4, s43, 8 -; SI-NEXT: v_writelane_b32 v61, s4, 51 -; SI-NEXT: s_lshr_b32 s4, s45, 24 +; SI-NEXT: v_writelane_b32 v61, s5, 53 +; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 24 ; SI-NEXT: v_writelane_b32 v61, s4, 50 -; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_writelane_b32 v61, s4, 49 -; SI-NEXT: s_lshr_b32 s4, s45, 8 +; SI-NEXT: v_writelane_b32 v61, s5, 51 +; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 16 ; SI-NEXT: v_writelane_b32 v61, s4, 48 -; SI-NEXT: s_lshr_b32 s4, s47, 24 -; SI-NEXT: v_writelane_b32 v61, s4, 47 -; SI-NEXT: s_lshr_b32 s4, s47, 16 +; SI-NEXT: v_writelane_b32 v61, s5, 49 +; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 8 ; SI-NEXT: v_writelane_b32 v61, s4, 46 -; SI-NEXT: s_lshr_b32 s4, s47, 8 -; SI-NEXT: v_writelane_b32 v61, s4, 45 -; SI-NEXT: s_lshr_b32 s4, s57, 16 +; SI-NEXT: v_writelane_b32 v61, s5, 47 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], 24 ; SI-NEXT: v_writelane_b32 v61, s4, 44 -; SI-NEXT: s_lshr_b32 s4, s57, 8 -; SI-NEXT: v_writelane_b32 v61, s4, 43 -; SI-NEXT: s_lshr_b32 s4, s59, 8 +; SI-NEXT: v_writelane_b32 v61, s5, 45 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], 16 ; SI-NEXT: v_writelane_b32 v61, s4, 42 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v61, s5, 43 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], 8 ; SI-NEXT: v_writelane_b32 v61, s4, 40 ; SI-NEXT: v_writelane_b32 v61, s5, 41 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 24 ; SI-NEXT: v_writelane_b32 v61, s4, 38 ; SI-NEXT: v_writelane_b32 v61, s5, 39 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 16 ; SI-NEXT: v_writelane_b32 v61, s4, 36 ; SI-NEXT: v_writelane_b32 v61, s5, 37 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 8 ; SI-NEXT: v_writelane_b32 v61, s4, 34 ; SI-NEXT: v_writelane_b32 v61, s5, 35 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; SI-NEXT: v_writelane_b32 v61, s4, 32 ; SI-NEXT: v_writelane_b32 v61, s5, 33 -; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: v_writelane_b32 v61, s4, 30 ; SI-NEXT: v_writelane_b32 v61, s5, 31 -; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 8 ; SI-NEXT: v_writelane_b32 v61, s4, 28 ; SI-NEXT: v_writelane_b32 v61, s5, 29 -; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 ; SI-NEXT: v_writelane_b32 v61, s4, 26 ; SI-NEXT: v_writelane_b32 v61, s5, 27 -; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 ; SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: v_writelane_b32 v61, s5, 25 -; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 8 ; SI-NEXT: v_writelane_b32 v61, s4, 22 ; SI-NEXT: v_writelane_b32 v61, s5, 23 -; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 ; SI-NEXT: v_writelane_b32 v61, s4, 20 ; SI-NEXT: v_writelane_b32 v61, s5, 21 -; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 ; SI-NEXT: v_writelane_b32 v61, s4, 18 ; SI-NEXT: v_writelane_b32 v61, s5, 19 ; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 8 @@ -45008,31 +44813,42 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_writelane_b32 v61, s5, 3 ; SI-NEXT: s_lshr_b64 s[4:5], s[58:59], 24 ; SI-NEXT: v_writelane_b32 v61, s4, 0 -; SI-NEXT: s_lshr_b32 s27, s7, 16 -; SI-NEXT: s_lshr_b32 s29, s7, 8 -; SI-NEXT: s_lshr_b32 s61, s9, 16 -; SI-NEXT: s_lshr_b32 s26, s9, 8 -; SI-NEXT: s_lshr_b32 s28, s11, 8 -; SI-NEXT: s_lshr_b32 s60, s57, 24 +; SI-NEXT: s_lshr_b32 s34, s7, 24 +; SI-NEXT: s_lshr_b32 s90, s7, 16 +; SI-NEXT: s_lshr_b32 s91, s7, 8 +; SI-NEXT: s_lshr_b32 s79, s9, 24 +; SI-NEXT: s_lshr_b32 s89, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s9, 8 +; SI-NEXT: s_lshr_b32 s76, s11, 24 +; SI-NEXT: s_lshr_b32 s78, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s11, 8 +; SI-NEXT: s_lshr_b32 s73, s13, 24 +; SI-NEXT: s_lshr_b32 s75, s13, 16 +; SI-NEXT: s_lshr_b32 s77, s13, 8 +; SI-NEXT: s_lshr_b32 s62, s15, 24 +; SI-NEXT: s_lshr_b32 s72, s15, 16 +; SI-NEXT: s_lshr_b32 s74, s15, 8 +; SI-NEXT: s_lshr_b32 s29, s17, 24 +; SI-NEXT: s_lshr_b32 s61, s17, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_lshr_b32 s26, s19, 24 +; SI-NEXT: s_lshr_b32 s28, s19, 16 +; SI-NEXT: s_lshr_b32 s60, s19, 8 +; SI-NEXT: s_lshr_b32 s95, s21, 24 +; SI-NEXT: s_lshr_b32 s31, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s21, 8 +; SI-NEXT: s_lshr_b32 s92, s23, 24 +; SI-NEXT: s_lshr_b32 s94, s23, 16 +; SI-NEXT: s_lshr_b32 s30, s23, 8 +; SI-NEXT: s_lshr_b32 s93, s25, 8 ; SI-NEXT: s_lshr_b32 s96, s59, 24 ; SI-NEXT: s_lshr_b32 s97, s59, 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 16 ; SI-NEXT: s_lshr_b64 s[36:37], s[22:23], 24 ; SI-NEXT: s_lshr_b64 s[38:39], s[22:23], 16 ; SI-NEXT: s_lshr_b64 s[48:49], s[24:25], 24 ; SI-NEXT: s_lshr_b64 s[50:51], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[40:41], 24 -; SI-NEXT: s_lshr_b64 s[54:55], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[52:53], s[40:41], 16 ; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 24 ; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 ; SI-NEXT: s_lshr_b64 s[68:69], s[44:45], 24 @@ -45064,65 +44880,65 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e64 v1, s7, 1.0 ; SI-NEXT: v_add_f32_e64 v2, s6, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s21, 1.0 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s6, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 -; SI-NEXT: v_add_f32_e64 v15, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s23, 1.0 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v15 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v17 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: v_add_f32_e64 v17, s23, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_add_f32_e64 v19, s25, 1.0 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v17 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v19 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; SI-NEXT: v_add_f32_e64 v19, s25, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_add_f32_e64 v21, s41, 1.0 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v17 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v19 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v21 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 -; SI-NEXT: v_add_f32_e64 v21, s41, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_add_f32_e64 v23, s43, 1.0 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v21 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v23 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_add_f32_e64 v37, s59, 1.0 -; SI-NEXT: v_add_f32_e64 v48, s58, 1.0 -; SI-NEXT: v_add_f32_e64 v32, s57, 1.0 -; SI-NEXT: v_add_f32_e64 v36, s56, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s47, 1.0 -; SI-NEXT: v_add_f32_e64 v31, s46, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s45, 1.0 -; SI-NEXT: v_add_f32_e64 v27, s44, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s43, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s42, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_add_f32_e64 v48, s59, 1.0 +; SI-NEXT: v_add_f32_e64 v49, s58, 1.0 +; SI-NEXT: v_add_f32_e64 v35, s57, 1.0 +; SI-NEXT: v_add_f32_e64 v39, s56, 1.0 +; SI-NEXT: v_add_f32_e64 v32, s47, 1.0 +; SI-NEXT: v_add_f32_e64 v34, s46, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s45, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s44, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s42, 1.0 ; SI-NEXT: v_add_f32_e64 v22, s40, 1.0 ; SI-NEXT: v_add_f32_e64 v20, s24, 1.0 ; SI-NEXT: v_add_f32_e64 v18, s22, 1.0 ; SI-NEXT: v_add_f32_e64 v16, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v14, s18, 1.0 ; SI-NEXT: v_add_f32_e64 v11, s17, 1.0 ; SI-NEXT: v_add_f32_e64 v12, s16, 1.0 @@ -45132,16 +44948,16 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v3, s9, 1.0 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v21 -; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_readfirstlane_b32 s5, v37 -; SI-NEXT: v_readfirstlane_b32 s56, v36 -; SI-NEXT: v_readfirstlane_b32 s57, v32 -; SI-NEXT: v_readfirstlane_b32 s46, v31 -; SI-NEXT: v_readfirstlane_b32 s47, v29 -; SI-NEXT: v_readfirstlane_b32 s44, v27 -; SI-NEXT: v_readfirstlane_b32 s45, v25 -; SI-NEXT: v_readfirstlane_b32 s42, v24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v23 +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_readfirstlane_b32 s5, v48 +; SI-NEXT: v_readfirstlane_b32 s56, v39 +; SI-NEXT: v_readfirstlane_b32 s57, v35 +; SI-NEXT: v_readfirstlane_b32 s46, v34 +; SI-NEXT: v_readfirstlane_b32 s47, v32 +; SI-NEXT: v_readfirstlane_b32 s44, v29 +; SI-NEXT: v_readfirstlane_b32 s45, v28 +; SI-NEXT: v_readfirstlane_b32 s42, v27 ; SI-NEXT: v_readfirstlane_b32 s43, v23 ; SI-NEXT: v_readfirstlane_b32 s40, v22 ; SI-NEXT: v_readfirstlane_b32 s41, v21 @@ -45162,7 +44978,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_readfirstlane_b32 s7, v1 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v28 ; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 ; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 8 @@ -45182,19 +44998,19 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 8 ; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[18:19], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[20:21], 16 ; SI-NEXT: s_lshr_b64 s[20:21], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[36:37], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[96:97], s[22:23], 24 ; SI-NEXT: s_lshr_b64 s[38:39], s[22:23], 16 ; SI-NEXT: s_lshr_b64 s[22:23], s[22:23], 8 ; SI-NEXT: s_lshr_b64 s[48:49], s[24:25], 24 ; SI-NEXT: s_lshr_b64 s[50:51], s[24:25], 16 ; SI-NEXT: s_lshr_b64 s[24:25], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[52:53], s[40:41], 24 -; SI-NEXT: s_lshr_b64 s[54:55], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[52:53], s[40:41], 16 ; SI-NEXT: s_lshr_b64 s[40:41], s[40:41], 8 ; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 24 ; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 @@ -45224,182 +45040,166 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 ; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v9 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v13 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v30, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v13 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v23 -; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 24, v29 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v29 -; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v53, 8, v32 -; SI-NEXT: v_lshrrev_b32_e32 v54, 24, v37 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v37 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v32 +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v48 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v48 ; SI-NEXT: s_branch .LBB37_5 ; SI-NEXT: .LBB37_3: -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 0 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: v_writelane_b32 v61, s61, 1 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 2 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 3 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 4 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 5 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 6 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 7 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 8 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 9 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 10 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 11 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 12 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 13 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 14 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 15 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 16 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 17 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 18 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 19 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 20 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 21 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 22 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 23 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 24 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 25 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 26 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 27 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 28 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 29 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 30 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 31 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 32 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 33 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 34 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 35 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 36 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 37 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 38 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 39 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 40 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 41 -; SI-NEXT: ; implicit-def: $sgpr97 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: v_writelane_b32 v61, s26, 0 +; SI-NEXT: v_writelane_b32 v61, s27, 1 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 2 +; SI-NEXT: v_writelane_b32 v61, s27, 3 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 4 +; SI-NEXT: v_writelane_b32 v61, s27, 5 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 6 +; SI-NEXT: v_writelane_b32 v61, s27, 7 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 8 +; SI-NEXT: v_writelane_b32 v61, s27, 9 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 10 +; SI-NEXT: v_writelane_b32 v61, s27, 11 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 12 +; SI-NEXT: v_writelane_b32 v61, s27, 13 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 14 +; SI-NEXT: v_writelane_b32 v61, s27, 15 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 16 +; SI-NEXT: v_writelane_b32 v61, s27, 17 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 18 +; SI-NEXT: v_writelane_b32 v61, s27, 19 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 20 +; SI-NEXT: v_writelane_b32 v61, s27, 21 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 22 ; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s27, 23 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 24 +; SI-NEXT: v_writelane_b32 v61, s27, 25 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v61, s26, 26 +; SI-NEXT: v_writelane_b32 v61, s27, 27 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: v_writelane_b32 v61, s62, 28 +; SI-NEXT: v_writelane_b32 v61, s63, 29 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: v_writelane_b32 v61, s62, 30 +; SI-NEXT: v_writelane_b32 v61, s63, 31 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: v_writelane_b32 v61, s62, 32 +; SI-NEXT: v_writelane_b32 v61, s63, 33 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: v_writelane_b32 v61, s62, 34 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: v_writelane_b32 v61, s63, 35 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: v_writelane_b32 v61, s62, 36 +; SI-NEXT: v_writelane_b32 v61, s63, 37 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: v_writelane_b32 v61, s62, 38 +; SI-NEXT: v_writelane_b32 v61, s63, 39 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: v_writelane_b32 v61, s76, 40 +; SI-NEXT: v_writelane_b32 v61, s77, 41 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: v_writelane_b32 v61, s76, 42 +; SI-NEXT: v_writelane_b32 v61, s77, 43 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: v_writelane_b32 v61, s76, 44 +; SI-NEXT: v_writelane_b32 v61, s77, 45 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: v_writelane_b32 v61, s76, 46 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: v_writelane_b32 v61, s77, 47 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: v_writelane_b32 v61, s76, 48 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: v_writelane_b32 v61, s77, 49 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: v_writelane_b32 v61, s76, 50 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: v_writelane_b32 v61, s77, 51 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: v_writelane_b32 v61, s90, 52 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: v_writelane_b32 v61, s91, 53 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: v_writelane_b32 v61, s90, 54 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: v_writelane_b32 v61, s91, 55 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: v_writelane_b32 v61, s90, 56 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: v_writelane_b32 v61, s91, 57 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: v_writelane_b32 v61, s90, 58 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: v_writelane_b32 v61, s91, 59 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: v_writelane_b32 v61, s90, 60 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: v_writelane_b32 v61, s91, 61 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: v_writelane_b32 v61, s90, 62 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: v_writelane_b32 v61, s91, 63 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr82 @@ -45408,154 +45208,130 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; kill: killed $sgpr5 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: s_branch .LBB37_2 ; SI-NEXT: .LBB37_4: ; SI-NEXT: v_mov_b32_e32 v1, s58 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s5, v61, 42 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s5, v62, 0 ; SI-NEXT: v_mov_b32_e32 v10, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 43 +; SI-NEXT: v_readlane_b32 s5, v62, 1 +; SI-NEXT: v_mov_b32_e32 v54, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 2 ; SI-NEXT: v_mov_b32_e32 v53, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 44 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 45 +; SI-NEXT: v_readlane_b32 s5, v62, 3 ; SI-NEXT: v_mov_b32_e32 v6, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 46 +; SI-NEXT: v_readlane_b32 s5, v62, 4 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 5 +; SI-NEXT: v_mov_b32_e32 v52, s5 +; SI-NEXT: v_readlane_b32 s5, v62, 6 ; SI-NEXT: v_mov_b32_e32 v51, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 47 +; SI-NEXT: v_readlane_b32 s5, v62, 7 ; SI-NEXT: v_mov_b32_e32 v50, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 48 -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 49 -; SI-NEXT: v_mov_b32_e32 v49, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 50 -; SI-NEXT: v_mov_b32_e32 v39, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 51 -; SI-NEXT: v_mov_b32_e32 v38, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 52 +; SI-NEXT: v_readlane_b32 s5, v62, 8 ; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 53 +; SI-NEXT: v_readlane_b32 s5, v62, 9 ; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 54 +; SI-NEXT: v_readlane_b32 s5, v62, 10 ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 55 +; SI-NEXT: v_readlane_b32 s5, v62, 11 ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 56 +; SI-NEXT: v_readlane_b32 s5, v62, 12 ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 57 +; SI-NEXT: v_readlane_b32 s5, v62, 13 ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 58 +; SI-NEXT: v_readlane_b32 s5, v62, 14 ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 59 +; SI-NEXT: v_readlane_b32 s5, v62, 15 ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 60 ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 61 +; SI-NEXT: v_mov_b32_e32 v12, s93 +; SI-NEXT: v_readlane_b32 s5, v62, 16 ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 62 +; SI-NEXT: v_readlane_b32 s5, v62, 17 +; SI-NEXT: v_mov_b32_e32 v1, s14 ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v61, 63 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 0 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 1 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 2 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 3 -; SI-NEXT: v_mov_b32_e32 v35, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 4 -; SI-NEXT: v_mov_b32_e32 v34, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 5 -; SI-NEXT: v_mov_b32_e32 v33, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 6 -; SI-NEXT: v_mov_b32_e32 v30, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 7 -; SI-NEXT: v_mov_b32_e32 v28, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 8 -; SI-NEXT: v_mov_b32_e32 v26, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 9 -; SI-NEXT: v_mov_b32_e32 v60, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 10 -; SI-NEXT: v_mov_b32_e32 v59, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 11 -; SI-NEXT: v_mov_b32_e32 v58, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 12 -; SI-NEXT: v_mov_b32_e32 v57, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 13 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_mov_b32_e32 v56, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 14 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s12 -; SI-NEXT: v_mov_b32_e32 v46, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 15 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s30 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_mov_b32_e32 v45, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 16 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s94 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s8 -; SI-NEXT: v_mov_b32_e32 v42, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 17 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s92 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, s5 -; SI-NEXT: v_mov_b32_e32 v37, s59 -; SI-NEXT: v_mov_b32_e32 v32, s57 -; SI-NEXT: v_mov_b32_e32 v29, s47 -; SI-NEXT: v_mov_b32_e32 v25, s45 +; SI-NEXT: v_mov_b32_e32 v12, s27 +; SI-NEXT: v_mov_b32_e32 v40, s34 +; SI-NEXT: v_mov_b32_e32 v48, s59 +; SI-NEXT: v_mov_b32_e32 v35, s57 +; SI-NEXT: v_mov_b32_e32 v32, s47 +; SI-NEXT: v_mov_b32_e32 v28, s45 ; SI-NEXT: v_mov_b32_e32 v23, s43 ; SI-NEXT: v_mov_b32_e32 v21, s41 ; SI-NEXT: v_mov_b32_e32 v19, s25 @@ -45570,51 +45346,90 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v55, s97 -; SI-NEXT: v_mov_b32_e32 v54, s96 -; SI-NEXT: v_mov_b32_e32 v52, s60 -; SI-NEXT: v_mov_b32_e32 v47, s28 -; SI-NEXT: v_mov_b32_e32 v44, s26 -; SI-NEXT: v_mov_b32_e32 v43, s61 -; SI-NEXT: v_mov_b32_e32 v41, s29 -; SI-NEXT: v_mov_b32_e32 v40, s27 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v8, s97 +; SI-NEXT: v_mov_b32_e32 v55, s96 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v38, s31 +; SI-NEXT: v_mov_b32_e32 v37, s95 +; SI-NEXT: v_mov_b32_e32 v36, s60 +; SI-NEXT: v_mov_b32_e32 v33, s28 +; SI-NEXT: v_mov_b32_e32 v31, s26 +; SI-NEXT: v_mov_b32_e32 v30, s63 +; SI-NEXT: v_mov_b32_e32 v26, s61 +; SI-NEXT: v_mov_b32_e32 v25, s29 +; SI-NEXT: v_mov_b32_e32 v24, s74 +; SI-NEXT: v_mov_b32_e32 v60, s72 +; SI-NEXT: v_mov_b32_e32 v59, s62 +; SI-NEXT: v_mov_b32_e32 v58, s77 +; SI-NEXT: v_mov_b32_e32 v57, s75 +; SI-NEXT: v_mov_b32_e32 v56, s73 +; SI-NEXT: v_mov_b32_e32 v47, s88 +; SI-NEXT: v_mov_b32_e32 v46, s78 +; SI-NEXT: v_mov_b32_e32 v45, s76 +; SI-NEXT: v_mov_b32_e32 v44, s35 +; SI-NEXT: v_mov_b32_e32 v43, s89 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, s90 +; SI-NEXT: v_mov_b32_e32 v41, s91 +; SI-NEXT: v_mov_b32_e32 v42, s79 ; SI-NEXT: v_mov_b32_e32 v12, s16 ; SI-NEXT: v_mov_b32_e32 v14, s18 ; SI-NEXT: v_mov_b32_e32 v16, s20 ; SI-NEXT: v_mov_b32_e32 v18, s22 ; SI-NEXT: v_mov_b32_e32 v20, s24 ; SI-NEXT: v_mov_b32_e32 v22, s40 -; SI-NEXT: v_mov_b32_e32 v24, s42 -; SI-NEXT: v_mov_b32_e32 v27, s44 -; SI-NEXT: v_mov_b32_e32 v31, s46 -; SI-NEXT: v_mov_b32_e32 v36, s56 -; SI-NEXT: v_readlane_b32 s26, v61, 40 -; SI-NEXT: v_readlane_b32 s27, v61, 41 -; SI-NEXT: v_readlane_b32 s28, v61, 38 -; SI-NEXT: v_readlane_b32 s29, v61, 39 -; SI-NEXT: v_readlane_b32 s6, v61, 36 -; SI-NEXT: v_readlane_b32 s7, v61, 37 -; SI-NEXT: v_readlane_b32 s58, v61, 34 -; SI-NEXT: v_readlane_b32 s59, v61, 35 -; SI-NEXT: v_readlane_b32 s60, v61, 32 -; SI-NEXT: v_readlane_b32 s61, v61, 33 -; SI-NEXT: v_readlane_b32 s8, v61, 30 -; SI-NEXT: v_readlane_b32 s9, v61, 31 -; SI-NEXT: v_readlane_b32 s10, v61, 28 -; SI-NEXT: v_readlane_b32 s11, v61, 29 -; SI-NEXT: v_readlane_b32 s12, v61, 26 -; SI-NEXT: v_readlane_b32 s13, v61, 27 -; SI-NEXT: v_readlane_b32 s14, v61, 24 -; SI-NEXT: v_readlane_b32 s15, v61, 25 -; SI-NEXT: v_readlane_b32 s16, v61, 22 -; SI-NEXT: v_readlane_b32 s17, v61, 23 -; SI-NEXT: s_mov_b32 s96, s94 -; SI-NEXT: v_readlane_b32 s94, v61, 20 -; SI-NEXT: v_readlane_b32 s95, v61, 21 -; SI-NEXT: v_readlane_b32 s18, v61, 18 -; SI-NEXT: v_readlane_b32 s19, v61, 19 +; SI-NEXT: v_mov_b32_e32 v27, s42 +; SI-NEXT: v_mov_b32_e32 v29, s44 +; SI-NEXT: v_mov_b32_e32 v34, s46 +; SI-NEXT: v_mov_b32_e32 v39, s56 +; SI-NEXT: v_readlane_b32 s26, v61, 62 +; SI-NEXT: v_readlane_b32 s27, v61, 63 +; SI-NEXT: v_readlane_b32 s28, v61, 60 +; SI-NEXT: v_readlane_b32 s29, v61, 61 +; SI-NEXT: v_readlane_b32 s6, v61, 58 +; SI-NEXT: v_readlane_b32 s7, v61, 59 +; SI-NEXT: v_readlane_b32 s58, v61, 56 +; SI-NEXT: v_readlane_b32 s59, v61, 57 +; SI-NEXT: v_readlane_b32 s60, v61, 54 +; SI-NEXT: v_readlane_b32 s61, v61, 55 +; SI-NEXT: v_readlane_b32 s8, v61, 52 +; SI-NEXT: v_readlane_b32 s9, v61, 53 +; SI-NEXT: v_readlane_b32 s62, v61, 50 +; SI-NEXT: v_readlane_b32 s63, v61, 51 +; SI-NEXT: v_readlane_b32 s72, v61, 48 +; SI-NEXT: v_readlane_b32 s73, v61, 49 +; SI-NEXT: v_readlane_b32 s10, v61, 46 +; SI-NEXT: v_readlane_b32 s11, v61, 47 +; SI-NEXT: v_readlane_b32 s74, v61, 44 +; SI-NEXT: v_readlane_b32 s75, v61, 45 +; SI-NEXT: v_readlane_b32 s76, v61, 42 +; SI-NEXT: v_readlane_b32 s77, v61, 43 +; SI-NEXT: v_readlane_b32 s12, v61, 40 +; SI-NEXT: v_readlane_b32 s13, v61, 41 +; SI-NEXT: v_readlane_b32 s78, v61, 38 +; SI-NEXT: v_readlane_b32 s79, v61, 39 +; SI-NEXT: v_readlane_b32 s88, v61, 36 +; SI-NEXT: v_readlane_b32 s89, v61, 37 +; SI-NEXT: v_readlane_b32 s14, v61, 34 +; SI-NEXT: v_readlane_b32 s15, v61, 35 +; SI-NEXT: v_readlane_b32 s90, v61, 32 +; SI-NEXT: v_readlane_b32 s91, v61, 33 +; SI-NEXT: v_readlane_b32 s92, v61, 30 +; SI-NEXT: v_readlane_b32 s93, v61, 31 +; SI-NEXT: v_readlane_b32 s16, v61, 28 +; SI-NEXT: v_readlane_b32 s17, v61, 29 +; SI-NEXT: v_readlane_b32 s94, v61, 26 +; SI-NEXT: v_readlane_b32 s95, v61, 27 +; SI-NEXT: v_readlane_b32 s30, v61, 24 +; SI-NEXT: v_readlane_b32 s31, v61, 25 +; SI-NEXT: v_readlane_b32 s18, v61, 22 +; SI-NEXT: v_readlane_b32 s19, v61, 23 +; SI-NEXT: v_readlane_b32 s34, v61, 20 +; SI-NEXT: v_readlane_b32 s35, v61, 21 +; SI-NEXT: s_mov_b32 s96, s36 +; SI-NEXT: v_readlane_b32 s36, v61, 18 +; SI-NEXT: v_readlane_b32 s37, v61, 19 ; SI-NEXT: v_readlane_b32 s20, v61, 16 ; SI-NEXT: v_readlane_b32 s21, v61, 17 ; SI-NEXT: v_readlane_b32 s22, v61, 14 @@ -45635,122 +45450,97 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_readlane_b32 vcc_hi, v61, 1 ; SI-NEXT: .LBB37_5: ; %end ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v48, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 ; SI-NEXT: s_lshl_b32 s4, s4, 8 -; SI-NEXT: v_or_b32_e32 v48, s4, v48 +; SI-NEXT: v_or_b32_e32 v49, s4, v49 ; SI-NEXT: s_and_b32 s4, s98, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, vcc_lo, 24 -; SI-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v37, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v48, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; SI-NEXT: v_or_b32_e32 v48, s4, v48 -; SI-NEXT: v_or_b32_e32 v10, v37, v10 -; SI-NEXT: v_and_b32_e32 v37, 0xff, v55 -; SI-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v54 -; SI-NEXT: v_or_b32_e32 v37, v48, v37 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v10, v10, v37 -; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v10, v37, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v36 -; SI-NEXT: s_lshl_b32 s4, s56, 8 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s86, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s84, 24 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: v_add_i32_e32 v36, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v53 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v10, v10, v32 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, vcc_lo, 24 +; SI-NEXT: v_or_b32_e32 v10, v48, v10 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v52 -; SI-NEXT: v_or_b32_e32 v8, v32, v8 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v55 +; SI-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v8, v48, v8 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v49, s4, v49 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v31 -; SI-NEXT: s_lshl_b32 s4, s46, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v39 +; SI-NEXT: s_lshl_b32 s4, s56, 8 ; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s82, 0xff +; SI-NEXT: s_and_b32 s4, s86, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s80, 24 +; SI-NEXT: s_lshl_b32 s5, s84, 24 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v54 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v50 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v27 -; SI-NEXT: s_lshl_b32 s4, s44, 8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v34 +; SI-NEXT: s_lshl_b32 s4, s46, 8 ; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s70, 0xff +; SI-NEXT: s_and_b32 s4, s82, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s68, 24 +; SI-NEXT: s_lshl_b32 s5, s80, 24 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v39 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v51 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v24 -; SI-NEXT: s_lshl_b32 s4, s42, 8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v29 +; SI-NEXT: s_lshl_b32 s4, s44, 8 ; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_and_b32 s4, s70, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s64, 24 +; SI-NEXT: s_lshl_b32 s5, s68, 24 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v38 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v50 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_lshl_b32 s4, s40, 8 -; SI-NEXT: s_lshl_b32 s5, s52, 24 +; SI-NEXT: s_lshl_b32 s4, s42, 8 +; SI-NEXT: s_lshl_b32 s5, s64, 24 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 @@ -45768,12 +45558,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_readlane_b32 s69, v63, 21 ; SI-NEXT: v_readlane_b32 s68, v63, 20 ; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 ; SI-NEXT: v_readlane_b32 s65, v63, 17 ; SI-NEXT: v_readlane_b32 s64, v63, 16 ; SI-NEXT: v_readlane_b32 s55, v63, 15 ; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 ; SI-NEXT: v_readlane_b32 s51, v63, 11 ; SI-NEXT: v_readlane_b32 s49, v63, 9 ; SI-NEXT: v_readlane_b32 s39, v63, 7 @@ -45784,30 +45572,62 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: v_or_b32_e32 v2, v6, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 +; SI-NEXT: s_lshl_b32 s4, s40, 8 +; SI-NEXT: s_lshl_b32 s5, s54, 24 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s54, 0xff +; SI-NEXT: s_and_b32 s4, s52, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 ; SI-NEXT: s_lshl_b32 s4, s24, 8 ; SI-NEXT: s_lshl_b32 s5, s48, 24 -; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s52, v63, 12 ; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 @@ -45828,21 +45648,21 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v2, s4, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v19 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s4, s22, 8 -; SI-NEXT: s_lshl_b32 s5, s36, 24 +; SI-NEXT: s_lshl_b32 s5, s96, 24 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s96, v63, 32 ; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s36, v63, 4 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -45860,18 +45680,18 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v2, s4, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s4, s20, 8 -; SI-NEXT: s_lshl_b32 s5, s30, 24 +; SI-NEXT: s_lshl_b32 s5, s34, 24 ; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -45885,30 +45705,26 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v16 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_and_b32 s4, s36, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v37 ; SI-NEXT: s_lshl_b32 s4, s18, 8 ; SI-NEXT: s_lshl_b32 s5, s94, 24 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 @@ -45916,25 +45732,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s96, 0xff +; SI-NEXT: s_and_b32 s4, s30, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v34 -; SI-NEXT: s_lshl_b32 s4, s16, 8 -; SI-NEXT: s_lshl_b32 s5, s90, 24 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v36 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v31 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -45942,9 +45753,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 +; SI-NEXT: s_lshl_b32 s4, s16, 8 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 ; SI-NEXT: s_and_b32 s4, s92, 0xff ; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s90, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 @@ -45952,11 +45765,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v28 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v25 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -45968,6 +45781,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: s_lshl_b32 s5, s78, 24 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v59 +; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 @@ -45979,7 +45793,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v24 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -46118,8 +45932,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -46356,22 +46170,22 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: s_lshr_b32 s70, s57, 8 ; VI-NEXT: s_lshr_b32 s69, s56, 16 ; VI-NEXT: s_lshr_b32 s71, s56, 8 -; VI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 -; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[46:47], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[56:57], 24 ; VI-NEXT: s_cbranch_execnz .LBB37_4 ; VI-NEXT: .LBB37_2: ; %cmp.true ; VI-NEXT: v_add_f32_e64 v2, s5, 1.0 @@ -46389,536 +46203,537 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] ; VI-NEXT: v_add_f32_e64 v8, s11, 1.0 ; VI-NEXT: v_add_f32_e64 v7, s10, 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] ; VI-NEXT: v_add_f32_e64 v10, s13, 1.0 ; VI-NEXT: v_add_f32_e64 v9, s12, 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] ; VI-NEXT: v_add_f32_e64 v12, s15, 1.0 ; VI-NEXT: v_add_f32_e64 v11, s14, 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] ; VI-NEXT: v_add_f32_e64 v14, s17, 1.0 ; VI-NEXT: v_add_f32_e64 v13, s16, 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] ; VI-NEXT: v_add_f32_e64 v16, s19, 1.0 ; VI-NEXT: v_add_f32_e64 v15, s18, 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] -; VI-NEXT: v_add_f32_e64 v18, s21, 1.0 -; VI-NEXT: v_add_f32_e64 v17, s20, 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: v_add_f32_e64 v20, s23, 1.0 -; VI-NEXT: v_add_f32_e64 v19, s22, 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: v_add_f32_e64 v22, s25, 1.0 -; VI-NEXT: v_add_f32_e64 v21, s24, 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_add_f32_e64 v18, s21, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_add_f32_e64 v17, s20, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_add_f32_e64 v20, s23, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_add_f32_e64 v19, s22, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_add_f32_e64 v22, s25, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_add_f32_e64 v21, s24, 1.0 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_add_f32_e64 v24, s41, 1.0 +; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: v_add_f32_e64 v28, s45, 1.0 ; VI-NEXT: v_add_f32_e64 v27, s44, 1.0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: v_lshrrev_b64 v[51:52], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: v_add_f32_e64 v30, s47, 1.0 ; VI-NEXT: v_add_f32_e64 v29, s46, 1.0 -; VI-NEXT: v_add_f32_e64 v24, s41, 1.0 ; VI-NEXT: v_add_f32_e64 v23, s40, 1.0 +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22] ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; VI-NEXT: v_add_f32_e64 v32, s57, 1.0 ; VI-NEXT: v_add_f32_e64 v31, s56, 1.0 ; VI-NEXT: v_add_f32_e64 v26, s43, 1.0 ; VI-NEXT: v_add_f32_e64 v25, s42, 1.0 ; VI-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v24 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v26 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v25 -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v28 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v28 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v27 -; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v30 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v30 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v29 -; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v32 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v26 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v30 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v31 ; VI-NEXT: s_branch .LBB37_5 ; VI-NEXT: .LBB37_3: -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 ; VI-NEXT: ; implicit-def: $sgpr71 ; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr70 ; VI-NEXT: ; implicit-def: $sgpr68 ; VI-NEXT: ; implicit-def: $sgpr67 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr65 ; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 ; VI-NEXT: ; implicit-def: $sgpr53 ; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr87 ; VI-NEXT: ; implicit-def: $sgpr86 ; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr85 ; VI-NEXT: ; implicit-def: $sgpr83 ; VI-NEXT: ; implicit-def: $sgpr82 ; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr30 -; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr28 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 ; VI-NEXT: s_branch .LBB37_2 ; VI-NEXT: .LBB37_4: -; VI-NEXT: v_mov_b32_e32 v53, s26 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s28 +; VI-NEXT: v_mov_b32_e32 v50, s48 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s38 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 0 -; VI-NEXT: v_mov_b32_e32 v48, s4 +; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s80 ; VI-NEXT: v_readlane_b32 s4, v62, 1 -; VI-NEXT: v_mov_b32_e32 v36, s4 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 2 -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v36, s4 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 3 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 4 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 5 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 6 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 7 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 8 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 9 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 10 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 11 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 12 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 13 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 14 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 15 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 16 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 18 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 19 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 20 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 21 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 22 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 23 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 24 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s36 ; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s58 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 27 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 28 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 34 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 36 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 37 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 39 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 40 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 41 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 48 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 50 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s34 ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 51 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s60 ; VI-NEXT: v_readlane_b32 s4, v62, 52 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 54 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_mov_b32_e32 v31, s56 ; VI-NEXT: v_mov_b32_e32 v32, s57 @@ -46951,124 +46766,110 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v3, s6 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v35, s71 -; VI-NEXT: v_mov_b32_e32 v61, s69 -; VI-NEXT: v_mov_b32_e32 v34, s70 -; VI-NEXT: v_mov_b32_e32 v33, s68 -; VI-NEXT: v_mov_b32_e32 v60, s67 -; VI-NEXT: v_mov_b32_e32 v52, s66 -; VI-NEXT: v_mov_b32_e32 v59, s64 -; VI-NEXT: v_mov_b32_e32 v58, s65 -; VI-NEXT: v_mov_b32_e32 v57, s55 -; VI-NEXT: v_mov_b32_e32 v49, s54 -; VI-NEXT: v_mov_b32_e32 v47, s53 -; VI-NEXT: v_mov_b32_e32 v56, s51 -; VI-NEXT: v_mov_b32_e32 v38, s52 -; VI-NEXT: v_mov_b32_e32 v51, s50 -; VI-NEXT: v_mov_b32_e32 v46, s87 -; VI-NEXT: v_mov_b32_e32 v44, s86 -; VI-NEXT: v_mov_b32_e32 v45, s84 -; VI-NEXT: v_mov_b32_e32 v43, s85 -; VI-NEXT: v_mov_b32_e32 v55, s83 -; VI-NEXT: v_mov_b32_e32 v42, s82 -; VI-NEXT: v_mov_b32_e32 v37, s81 -; VI-NEXT: v_mov_b32_e32 v50, s80 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v39, s36 -; VI-NEXT: v_mov_b32_e32 v40, s38 -; VI-NEXT: v_mov_b32_e32 v41, s48 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v56, s71 +; VI-NEXT: v_mov_b32_e32 v47, s69 +; VI-NEXT: v_mov_b32_e32 v46, s70 +; VI-NEXT: v_mov_b32_e32 v44, s68 +; VI-NEXT: v_mov_b32_e32 v45, s67 +; VI-NEXT: v_mov_b32_e32 v49, s66 +; VI-NEXT: v_mov_b32_e32 v38, s64 +; VI-NEXT: v_mov_b32_e32 v43, s65 +; VI-NEXT: v_mov_b32_e32 v55, s55 +; VI-NEXT: v_mov_b32_e32 v42, s54 +; VI-NEXT: v_mov_b32_e32 v36, s53 +; VI-NEXT: v_mov_b32_e32 v34, s51 +; VI-NEXT: v_mov_b32_e32 v61, s52 +; VI-NEXT: v_mov_b32_e32 v60, s50 +; VI-NEXT: v_mov_b32_e32 v33, s87 +; VI-NEXT: v_mov_b32_e32 v48, s86 +; VI-NEXT: v_mov_b32_e32 v59, s84 +; VI-NEXT: v_mov_b32_e32 v37, s85 +; VI-NEXT: v_mov_b32_e32 v57, s83 +; VI-NEXT: v_mov_b32_e32 v58, s82 +; VI-NEXT: v_mov_b32_e32 v35, s81 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v52, s72 ; VI-NEXT: v_mov_b32_e32 v53, s62 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s72 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s74 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s76 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s78 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s88 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s90 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s30 -; VI-NEXT: v_mov_b32_e32 v54, s34 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s30 +; VI-NEXT: v_mov_b32_e32 v54, s60 +; VI-NEXT: v_mov_b32_e32 v39, s58 +; VI-NEXT: v_mov_b32_e32 v40, s28 +; VI-NEXT: v_mov_b32_e32 v41, s26 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s90 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s88 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s78 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, s76 +; VI-NEXT: v_mov_b32_e32 v51, s74 ; VI-NEXT: .LBB37_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 -; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; VI-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v41 -; VI-NEXT: v_or_b32_sdwa v31, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v34, v61, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v56 +; VI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; VI-NEXT: v_or_b32_sdwa v31, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v41, v47, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v60 -; VI-NEXT: v_or_b32_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v46 +; VI-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v45 +; VI-NEXT: v_or_b32_sdwa v32, v44, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v32, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v49 +; VI-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v40 -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v52 -; VI-NEXT: v_or_b32_sdwa v31, v59, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v38, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v31, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v43 ; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v49 -; VI-NEXT: v_or_b32_sdwa v30, v57, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v42 +; VI-NEXT: v_or_b32_sdwa v30, v55, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v30, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v36 +; VI-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v39 -; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v47 -; VI-NEXT: v_or_b32_sdwa v29, v56, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v34, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v29, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v38 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v61 ; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v46 -; VI-NEXT: v_or_b32_sdwa v28, v51, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v33 +; VI-NEXT: v_or_b32_sdwa v28, v60, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v28, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v48 +; VI-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v54 -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v44 -; VI-NEXT: v_or_b32_sdwa v27, v45, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v27, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v43 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v37 ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v42 -; VI-NEXT: v_or_b32_sdwa v26, v55, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v58 +; VI-NEXT: v_or_b32_sdwa v26, v57, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v26, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v35 +; VI-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v53 -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v37 -; VI-NEXT: v_or_b32_sdwa v25, v48, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v25, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v50 -; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v36 ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -47102,147 +46903,150 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v25, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v52 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 -; VI-NEXT: v_or_b32_sdwa v21, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v23, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v22, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: v_or_b32_sdwa v19, v19, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v21, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v20, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v50 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; VI-NEXT: v_or_b32_sdwa v17, v17, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v19, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -47250,25 +47054,25 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -47276,25 +47080,25 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -47302,25 +47106,25 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -47328,25 +47132,25 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -47354,25 +47158,25 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -47380,13 +47184,13 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -47407,8 +47211,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -47417,8 +47221,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v63, s30, 0 ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 @@ -47651,629 +47455,584 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: s_lshr_b32 s80, s57, 8 ; GFX9-NEXT: s_lshr_b32 s71, s56, 16 ; GFX9-NEXT: s_lshr_b32 s81, s56, 8 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[56:57], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB37_4 ; GFX9-NEXT: .LBB37_2: ; %cmp.true ; GFX9-NEXT: v_add_f32_e64 v2, s5, 1.0 ; GFX9-NEXT: v_add_f32_e64 v1, s4, 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] ; GFX9-NEXT: v_add_f32_e64 v4, s7, 1.0 ; GFX9-NEXT: v_add_f32_e64 v3, s6, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] ; GFX9-NEXT: v_add_f32_e64 v6, s9, 1.0 ; GFX9-NEXT: v_add_f32_e64 v5, s8, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] ; GFX9-NEXT: v_add_f32_e64 v8, s11, 1.0 ; GFX9-NEXT: v_add_f32_e64 v7, s10, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] ; GFX9-NEXT: v_add_f32_e64 v10, s13, 1.0 ; GFX9-NEXT: v_add_f32_e64 v9, s12, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] ; GFX9-NEXT: v_add_f32_e64 v12, s15, 1.0 ; GFX9-NEXT: v_add_f32_e64 v11, s14, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] ; GFX9-NEXT: v_add_f32_e64 v14, s17, 1.0 ; GFX9-NEXT: v_add_f32_e64 v13, s16, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] -; GFX9-NEXT: v_add_f32_e64 v23, s19, 1.0 -; GFX9-NEXT: v_add_f32_e64 v22, s18, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; GFX9-NEXT: v_add_f32_e64 v16, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v15, s18, 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[22:23] -; GFX9-NEXT: v_add_f32_e64 v25, s21, 1.0 -; GFX9-NEXT: v_add_f32_e64 v24, s20, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[24:25] -; GFX9-NEXT: v_add_f32_e64 v27, s23, 1.0 -; GFX9-NEXT: v_add_f32_e64 v26, s22, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[26:27] -; GFX9-NEXT: v_add_f32_e64 v29, s25, 1.0 -; GFX9-NEXT: v_add_f32_e64 v28, s24, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[28:29] -; GFX9-NEXT: v_add_f32_e64 v31, s41, 1.0 -; GFX9-NEXT: v_add_f32_e64 v30, s40, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[30:31] -; GFX9-NEXT: v_add_f32_e64 v33, s43, 1.0 -; GFX9-NEXT: v_add_f32_e64 v32, s42, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[32:33] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v2 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v1 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v3 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v6 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v6 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v5 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v9 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v11 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v13 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v23 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v22 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v25 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v27 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v27 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v29 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v29 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v31 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; GFX9-NEXT: v_add_f32_e64 v35, s45, 1.0 -; GFX9-NEXT: v_add_f32_e64 v34, s44, 1.0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v33 -; GFX9-NEXT: v_add_f32_e64 v37, s47, 1.0 -; GFX9-NEXT: v_add_f32_e64 v36, s46, 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[34:35] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v33 -; GFX9-NEXT: v_add_f32_e64 v39, s57, 1.0 -; GFX9-NEXT: v_add_f32_e64 v38, s56, 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[36:37] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 -; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[38:39] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v35 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v36 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v36 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 24, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v38 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v1 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v3 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: v_add_f32_e64 v18, s21, 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_add_f32_e64 v17, s20, 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_add_f32_e64 v20, s23, 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_add_f32_e64 v19, s22, 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_add_f32_e64 v22, s25, 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_add_f32_e64 v21, s24, 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_add_f32_e64 v24, s41, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_add_f32_e64 v28, s45, 1.0 +; GFX9-NEXT: v_add_f32_e64 v27, s44, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_add_f32_e64 v30, s47, 1.0 +; GFX9-NEXT: v_add_f32_e64 v29, s46, 1.0 +; GFX9-NEXT: v_add_f32_e64 v23, s40, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_add_f32_e64 v32, s57, 1.0 +; GFX9-NEXT: v_add_f32_e64 v31, s56, 1.0 +; GFX9-NEXT: v_add_f32_e64 v26, s43, 1.0 +; GFX9-NEXT: v_add_f32_e64 v25, s42, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v31 ; GFX9-NEXT: s_branch .LBB37_5 ; GFX9-NEXT: .LBB37_3: -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 ; GFX9-NEXT: ; implicit-def: $sgpr81 ; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr80 ; GFX9-NEXT: ; implicit-def: $sgpr70 ; GFX9-NEXT: ; implicit-def: $sgpr69 ; GFX9-NEXT: ; implicit-def: $sgpr68 ; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr67 ; GFX9-NEXT: ; implicit-def: $sgpr65 ; GFX9-NEXT: ; implicit-def: $sgpr64 ; GFX9-NEXT: ; implicit-def: $sgpr55 ; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr54 ; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr51 ; GFX9-NEXT: ; implicit-def: $sgpr50 ; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr49 ; GFX9-NEXT: ; implicit-def: $sgpr39 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr99 ; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr98 ; GFX9-NEXT: ; implicit-def: $sgpr96 ; GFX9-NEXT: ; implicit-def: $sgpr87 ; GFX9-NEXT: ; implicit-def: $sgpr86 ; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr72 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr36 -; GFX9-NEXT: ; implicit-def: $sgpr34 -; GFX9-NEXT: ; implicit-def: $sgpr30 -; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr92 -; GFX9-NEXT: ; implicit-def: $sgpr90 -; GFX9-NEXT: ; implicit-def: $sgpr88 -; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr76 ; GFX9-NEXT: ; implicit-def: $sgpr74 -; GFX9-NEXT: ; implicit-def: $sgpr72 -; GFX9-NEXT: ; implicit-def: $sgpr62 -; GFX9-NEXT: ; implicit-def: $sgpr60 -; GFX9-NEXT: ; implicit-def: $sgpr58 -; GFX9-NEXT: ; implicit-def: $sgpr28 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 ; GFX9-NEXT: s_branch .LBB37_2 ; GFX9-NEXT: .LBB37_4: -; GFX9-NEXT: v_mov_b32_e32 v52, s48 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s39 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s38 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s97 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s96 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s87 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s84 +; GFX9-NEXT: v_mov_b32_e32 v50, s36 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v50, s34 +; GFX9-NEXT: v_mov_b32_e32 v39, s97 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s98 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s96 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s87 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s86 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s84 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s85 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s83 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s83 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s82 ; GFX9-NEXT: v_readlane_b32 s4, v62, 0 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s82 -; GFX9-NEXT: v_mov_b32_e32 v46, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 1 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 2 -; GFX9-NEXT: v_mov_b32_e32 v45, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 3 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 4 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 5 -; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 6 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 7 -; GFX9-NEXT: v_mov_b32_e32 v43, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 8 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 9 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 10 -; GFX9-NEXT: v_mov_b32_e32 v55, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 11 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 12 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v52, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 13 -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 14 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 15 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 16 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 17 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 18 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 19 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v50, s30 +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 20 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 21 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 22 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 23 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 24 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 25 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 26 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 27 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 29 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 31 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 32 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 33 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 34 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 35 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 36 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 37 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 38 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 39 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 40 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 41 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 42 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 43 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 44 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 45 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v50, s94 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 46 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 47 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 48 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s4 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s26 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s58 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s60 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s62 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s72 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s74 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s76 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s78 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s88 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s90 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s92 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s94 -; GFX9-NEXT: v_mov_b32_e32 v49, s52 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v38, s56 -; GFX9-NEXT: v_mov_b32_e32 v39, s57 -; GFX9-NEXT: v_mov_b32_e32 v36, s46 -; GFX9-NEXT: v_mov_b32_e32 v37, s47 -; GFX9-NEXT: v_mov_b32_e32 v34, s44 -; GFX9-NEXT: v_mov_b32_e32 v35, s45 -; GFX9-NEXT: v_mov_b32_e32 v32, s42 -; GFX9-NEXT: v_mov_b32_e32 v33, s43 -; GFX9-NEXT: v_mov_b32_e32 v30, s40 -; GFX9-NEXT: v_mov_b32_e32 v31, s41 -; GFX9-NEXT: v_mov_b32_e32 v28, s24 -; GFX9-NEXT: v_mov_b32_e32 v29, s25 -; GFX9-NEXT: v_mov_b32_e32 v26, s22 -; GFX9-NEXT: v_mov_b32_e32 v27, s23 -; GFX9-NEXT: v_mov_b32_e32 v24, s20 -; GFX9-NEXT: v_mov_b32_e32 v25, s21 -; GFX9-NEXT: v_mov_b32_e32 v22, s18 -; GFX9-NEXT: v_mov_b32_e32 v23, s19 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v39, s4 +; GFX9-NEXT: v_mov_b32_e32 v31, s56 +; GFX9-NEXT: v_mov_b32_e32 v32, s57 +; GFX9-NEXT: v_mov_b32_e32 v29, s46 +; GFX9-NEXT: v_mov_b32_e32 v30, s47 +; GFX9-NEXT: v_mov_b32_e32 v27, s44 +; GFX9-NEXT: v_mov_b32_e32 v28, s45 +; GFX9-NEXT: v_mov_b32_e32 v25, s42 +; GFX9-NEXT: v_mov_b32_e32 v26, s43 +; GFX9-NEXT: v_mov_b32_e32 v23, s40 +; GFX9-NEXT: v_mov_b32_e32 v24, s41 +; GFX9-NEXT: v_mov_b32_e32 v21, s24 +; GFX9-NEXT: v_mov_b32_e32 v22, s25 +; GFX9-NEXT: v_mov_b32_e32 v19, s22 +; GFX9-NEXT: v_mov_b32_e32 v20, s23 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 ; GFX9-NEXT: v_mov_b32_e32 v13, s16 ; GFX9-NEXT: v_mov_b32_e32 v14, s17 ; GFX9-NEXT: v_mov_b32_e32 v11, s14 @@ -48287,89 +48046,108 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v48, s81 -; GFX9-NEXT: v_mov_b32_e32 v21, s71 -; GFX9-NEXT: v_mov_b32_e32 v16, s80 -; GFX9-NEXT: v_mov_b32_e32 v19, s70 -; GFX9-NEXT: v_mov_b32_e32 v20, s69 -; GFX9-NEXT: v_mov_b32_e32 v15, s68 -; GFX9-NEXT: v_mov_b32_e32 v18, s66 -; GFX9-NEXT: v_mov_b32_e32 v61, s67 -; GFX9-NEXT: v_mov_b32_e32 v51, s65 -; GFX9-NEXT: v_mov_b32_e32 v17, s64 -; GFX9-NEXT: v_mov_b32_e32 v54, s55 -; GFX9-NEXT: v_mov_b32_e32 v50, s53 -; GFX9-NEXT: v_mov_b32_e32 v60, s54 -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v49, s51 -; GFX9-NEXT: v_mov_b32_e32 v59, s50 -; GFX9-NEXT: v_mov_b32_e32 v58, s49 -; GFX9-NEXT: v_mov_b32_e32 v57, s99 -; GFX9-NEXT: v_mov_b32_e32 v53, s98 -; GFX9-NEXT: v_mov_b32_e32 v56, s86 -; GFX9-NEXT: v_mov_b32_e32 v47, s85 -; GFX9-NEXT: v_mov_b32_e32 v40, s30 -; GFX9-NEXT: v_mov_b32_e32 v41, s34 -; GFX9-NEXT: v_mov_b32_e32 v42, s36 +; GFX9-NEXT: v_mov_b32_e32 v47, s81 +; GFX9-NEXT: v_mov_b32_e32 v46, s71 +; GFX9-NEXT: v_mov_b32_e32 v38, s80 +; GFX9-NEXT: v_mov_b32_e32 v49, s70 +; GFX9-NEXT: v_mov_b32_e32 v45, s69 +; GFX9-NEXT: v_mov_b32_e32 v35, s68 +; GFX9-NEXT: v_mov_b32_e32 v44, s66 +; GFX9-NEXT: v_mov_b32_e32 v43, s67 +; GFX9-NEXT: v_mov_b32_e32 v55, s65 +; GFX9-NEXT: v_mov_b32_e32 v42, s64 +; GFX9-NEXT: v_mov_b32_e32 v36, s55 +; GFX9-NEXT: v_mov_b32_e32 v48, s53 +; GFX9-NEXT: v_mov_b32_e32 v34, s54 +; GFX9-NEXT: v_mov_b32_e32 v33, s52 +; GFX9-NEXT: v_mov_b32_e32 v61, s51 +; GFX9-NEXT: v_mov_b32_e32 v60, s50 +; GFX9-NEXT: v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v37, s49 +; GFX9-NEXT: v_mov_b32_e32 v57, s39 +; GFX9-NEXT: v_mov_b32_e32 v58, s38 +; GFX9-NEXT: v_mov_b32_e32 v56, s99 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v52, s72 +; GFX9-NEXT: v_mov_b32_e32 v53, s62 +; GFX9-NEXT: v_mov_b32_e32 v54, s60 +; GFX9-NEXT: v_mov_b32_e32 v39, s58 +; GFX9-NEXT: v_mov_b32_e32 v40, s28 +; GFX9-NEXT: v_mov_b32_e32 v41, s26 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v50, s92 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v50, s90 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v50, s88 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v50, s78 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v50, s76 +; GFX9-NEXT: v_mov_b32_e32 v51, s74 ; GFX9-NEXT: .LBB37_5: ; %end -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v15, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v34, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v35, v35, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v33, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v57 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v53 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; GFX9-NEXT: v_or_b32_sdwa v16, v39, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v56 -; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v47 -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v46 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v41 -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v44 -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v43 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v55 -; GFX9-NEXT: v_or_b32_sdwa v15, v51, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v22, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v52 -; GFX9-NEXT: v_or_b32_sdwa v15, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v48 -; GFX9-NEXT: v_or_b32_sdwa v23, v23, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v42 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v40 -; GFX9-NEXT: v_or_b32_sdwa v38, v38, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v50, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v34, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v49 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v47 +; GFX9-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v41, v46, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v32, v49, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v31, v44, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v30, v55, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v29, v48, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v28, v33, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v27, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v26, v57, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v53 ; GFX9-NEXT: v_readlane_b32 s99, v63, 35 ; GFX9-NEXT: v_readlane_b32 s98, v63, 34 ; GFX9-NEXT: v_readlane_b32 s97, v63, 33 @@ -48407,293 +48185,284 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v30, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v52 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v28, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -48715,8 +48484,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -48726,267 +48495,265 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-NEXT: v_writelane_b32 v75, s30, 0 -; GFX11-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-NEXT: v_writelane_b32 v73, s30, 0 +; GFX11-NEXT: v_writelane_b32 v74, s96, 0 ; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; GFX11-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 -; GFX11-NEXT: v_writelane_b32 v75, s31, 1 -; GFX11-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-NEXT: v_writelane_b32 v73, s31, 1 +; GFX11-NEXT: v_writelane_b32 v74, s97, 1 ; GFX11-NEXT: v_dual_mov_b32 v20, s16 :: v_dual_mov_b32 v21, s17 ; GFX11-NEXT: v_dual_mov_b32 v22, s18 :: v_dual_mov_b32 v23, s19 -; GFX11-NEXT: v_writelane_b32 v75, s34, 2 -; GFX11-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-NEXT: v_writelane_b32 v73, s34, 2 +; GFX11-NEXT: v_writelane_b32 v74, s98, 2 ; GFX11-NEXT: v_dual_mov_b32 v24, s20 :: v_dual_mov_b32 v25, s21 ; GFX11-NEXT: v_dual_mov_b32 v26, s22 :: v_dual_mov_b32 v27, s23 -; GFX11-NEXT: v_writelane_b32 v75, s35, 3 -; GFX11-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-NEXT: v_writelane_b32 v73, s35, 3 +; GFX11-NEXT: v_writelane_b32 v74, s99, 3 ; GFX11-NEXT: v_dual_mov_b32 v28, s24 :: v_dual_mov_b32 v29, s25 ; GFX11-NEXT: v_dual_mov_b32 v30, s26 :: v_dual_mov_b32 v31, s27 -; GFX11-NEXT: v_writelane_b32 v75, s36, 4 -; GFX11-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-NEXT: v_writelane_b32 v73, s36, 4 +; GFX11-NEXT: v_writelane_b32 v74, s100, 4 ; GFX11-NEXT: v_dual_mov_b32 v32, s28 :: v_dual_mov_b32 v33, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 -; GFX11-NEXT: v_writelane_b32 v75, s37, 5 -; GFX11-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-NEXT: v_writelane_b32 v73, s37, 5 +; GFX11-NEXT: v_writelane_b32 v74, s101, 5 ; GFX11-NEXT: v_readfirstlane_b32 s40, v16 ; GFX11-NEXT: v_readfirstlane_b32 s41, v17 ; GFX11-NEXT: v_readfirstlane_b32 s28, v18 -; GFX11-NEXT: v_writelane_b32 v75, s38, 6 -; GFX11-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-NEXT: v_writelane_b32 v73, s38, 6 +; GFX11-NEXT: v_writelane_b32 v74, s102, 6 ; GFX11-NEXT: v_readfirstlane_b32 s29, v19 ; GFX11-NEXT: v_readfirstlane_b32 s26, v20 ; GFX11-NEXT: v_readfirstlane_b32 s27, v21 -; GFX11-NEXT: v_writelane_b32 v75, s39, 7 -; GFX11-NEXT: v_writelane_b32 v76, s103, 7 +; GFX11-NEXT: v_writelane_b32 v73, s39, 7 +; GFX11-NEXT: v_writelane_b32 v74, s103, 7 ; GFX11-NEXT: v_readfirstlane_b32 s24, v22 ; GFX11-NEXT: v_readfirstlane_b32 s25, v23 ; GFX11-NEXT: v_readfirstlane_b32 s22, v24 -; GFX11-NEXT: v_writelane_b32 v75, s48, 8 +; GFX11-NEXT: v_writelane_b32 v73, s48, 8 ; GFX11-NEXT: v_readfirstlane_b32 s23, v25 ; GFX11-NEXT: v_readfirstlane_b32 s20, v26 ; GFX11-NEXT: v_readfirstlane_b32 s21, v27 ; GFX11-NEXT: v_readfirstlane_b32 s18, v28 -; GFX11-NEXT: v_writelane_b32 v75, s49, 9 +; GFX11-NEXT: v_writelane_b32 v73, s49, 9 ; GFX11-NEXT: v_readfirstlane_b32 s19, v29 ; GFX11-NEXT: v_readfirstlane_b32 s16, v30 ; GFX11-NEXT: v_readfirstlane_b32 s17, v31 ; GFX11-NEXT: v_readfirstlane_b32 s14, v32 -; GFX11-NEXT: v_writelane_b32 v75, s50, 10 +; GFX11-NEXT: v_writelane_b32 v73, s50, 10 ; GFX11-NEXT: v_readfirstlane_b32 s15, v33 ; GFX11-NEXT: v_readfirstlane_b32 s12, v1 ; GFX11-NEXT: v_readfirstlane_b32 s13, v2 ; GFX11-NEXT: v_readfirstlane_b32 s10, v3 -; GFX11-NEXT: v_writelane_b32 v75, s51, 11 +; GFX11-NEXT: v_writelane_b32 v73, s51, 11 ; GFX11-NEXT: v_readfirstlane_b32 s11, v4 ; GFX11-NEXT: v_readfirstlane_b32 s0, v5 ; GFX11-NEXT: v_readfirstlane_b32 s1, v6 ; GFX11-NEXT: v_readfirstlane_b32 s2, v7 -; GFX11-NEXT: v_writelane_b32 v75, s52, 12 +; GFX11-NEXT: v_writelane_b32 v73, s52, 12 ; GFX11-NEXT: v_readfirstlane_b32 s3, v8 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 -; GFX11-NEXT: v_writelane_b32 v75, s53, 13 +; GFX11-NEXT: v_writelane_b32 v73, s53, 13 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 ; GFX11-NEXT: v_readfirstlane_b32 s8, v13 ; GFX11-NEXT: v_readfirstlane_b32 s9, v14 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 -; GFX11-NEXT: v_writelane_b32 v75, s54, 14 +; GFX11-NEXT: v_writelane_b32 v73, s54, 14 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 -; GFX11-NEXT: v_writelane_b32 v76, s104, 8 -; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane -; GFX11-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v75, s55, 15 -; GFX11-NEXT: v_writelane_b32 v75, s64, 16 -; GFX11-NEXT: v_writelane_b32 v75, s65, 17 -; GFX11-NEXT: v_writelane_b32 v75, s66, 18 -; GFX11-NEXT: v_writelane_b32 v75, s67, 19 -; GFX11-NEXT: v_writelane_b32 v75, s68, 20 -; GFX11-NEXT: v_writelane_b32 v75, s69, 21 -; GFX11-NEXT: v_writelane_b32 v75, s70, 22 -; GFX11-NEXT: v_writelane_b32 v75, s71, 23 -; GFX11-NEXT: v_writelane_b32 v75, s80, 24 -; GFX11-NEXT: v_writelane_b32 v75, s81, 25 -; GFX11-NEXT: v_writelane_b32 v75, s82, 26 -; GFX11-NEXT: v_writelane_b32 v75, s83, 27 -; GFX11-NEXT: v_writelane_b32 v75, s84, 28 -; GFX11-NEXT: v_writelane_b32 v75, s85, 29 -; GFX11-NEXT: v_writelane_b32 v75, s86, 30 -; GFX11-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-NEXT: s_clause 0x10 ; 68-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 +; GFX11-NEXT: v_writelane_b32 v74, s104, 8 +; GFX11-NEXT: ; implicit-def: $vgpr75 : SGPR spill to VGPR lane +; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane +; GFX11-NEXT: v_writelane_b32 v73, s55, 15 +; GFX11-NEXT: v_writelane_b32 v73, s64, 16 +; GFX11-NEXT: v_writelane_b32 v73, s65, 17 +; GFX11-NEXT: v_writelane_b32 v73, s66, 18 +; GFX11-NEXT: v_writelane_b32 v73, s67, 19 +; GFX11-NEXT: v_writelane_b32 v73, s68, 20 +; GFX11-NEXT: v_writelane_b32 v73, s69, 21 +; GFX11-NEXT: v_writelane_b32 v73, s70, 22 +; GFX11-NEXT: v_writelane_b32 v73, s71, 23 +; GFX11-NEXT: v_writelane_b32 v73, s80, 24 +; GFX11-NEXT: v_writelane_b32 v73, s81, 25 +; GFX11-NEXT: v_writelane_b32 v73, s82, 26 +; GFX11-NEXT: v_writelane_b32 v73, s83, 27 +; GFX11-NEXT: v_writelane_b32 v73, s84, 28 +; GFX11-NEXT: v_writelane_b32 v73, s85, 29 +; GFX11-NEXT: v_writelane_b32 v73, s86, 30 +; GFX11-NEXT: v_writelane_b32 v73, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB37_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s42, s9, 24 ; GFX11-NEXT: s_lshr_b32 s36, s17, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s9, 16 ; GFX11-NEXT: s_lshr_b32 s38, s17, 8 ; GFX11-NEXT: s_lshr_b32 s37, s16, 16 ; GFX11-NEXT: s_lshr_b32 s39, s16, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-NEXT: v_writelane_b32 v76, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s9, 8 ; GFX11-NEXT: s_lshr_b32 s48, s19, 24 ; GFX11-NEXT: s_lshr_b32 s49, s19, 16 ; GFX11-NEXT: s_lshr_b32 s51, s19, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-NEXT: v_writelane_b32 v76, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s8, 16 ; GFX11-NEXT: s_lshr_b32 s50, s18, 16 ; GFX11-NEXT: s_lshr_b32 s52, s18, 8 ; GFX11-NEXT: s_lshr_b32 s53, s21, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-NEXT: v_writelane_b32 v76, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s8, 8 ; GFX11-NEXT: s_lshr_b32 s54, s21, 16 ; GFX11-NEXT: s_lshr_b32 s64, s21, 8 ; GFX11-NEXT: s_lshr_b32 s55, s20, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-NEXT: v_writelane_b32 v76, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s7, 24 ; GFX11-NEXT: s_lshr_b32 s65, s20, 8 ; GFX11-NEXT: s_lshr_b32 s66, s23, 24 ; GFX11-NEXT: s_lshr_b32 s67, s23, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-NEXT: v_writelane_b32 v76, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s7, 16 ; GFX11-NEXT: s_lshr_b32 s69, s23, 8 ; GFX11-NEXT: s_lshr_b32 s68, s22, 16 ; GFX11-NEXT: s_lshr_b32 s70, s22, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-NEXT: v_writelane_b32 v76, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s7, 8 ; GFX11-NEXT: s_lshr_b32 s71, s25, 24 ; GFX11-NEXT: s_lshr_b32 s80, s25, 16 ; GFX11-NEXT: s_lshr_b32 s82, s25, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 1 +; GFX11-NEXT: v_writelane_b32 v76, s42, 1 ; GFX11-NEXT: s_lshr_b32 s42, s6, 16 ; GFX11-NEXT: s_lshr_b32 s81, s24, 16 ; GFX11-NEXT: s_lshr_b32 s83, s24, 8 ; GFX11-NEXT: s_lshr_b32 s84, s27, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-NEXT: v_writelane_b32 v76, s42, 0 ; GFX11-NEXT: s_lshr_b32 s42, s6, 8 ; GFX11-NEXT: s_lshr_b32 s85, s27, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 31 +; GFX11-NEXT: v_writelane_b32 v75, s42, 31 ; GFX11-NEXT: s_lshr_b32 s42, s5, 24 ; GFX11-NEXT: s_lshr_b32 s87, s27, 8 ; GFX11-NEXT: s_lshr_b32 s86, s26, 16 ; GFX11-NEXT: s_lshr_b32 s96, s26, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 30 +; GFX11-NEXT: v_writelane_b32 v75, s42, 30 ; GFX11-NEXT: s_lshr_b32 s42, s5, 16 ; GFX11-NEXT: s_lshr_b32 s97, s29, 24 ; GFX11-NEXT: s_lshr_b32 s98, s29, 16 ; GFX11-NEXT: s_lshr_b32 s100, s29, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 29 +; GFX11-NEXT: v_writelane_b32 v75, s42, 29 ; GFX11-NEXT: s_lshr_b32 s42, s5, 8 ; GFX11-NEXT: s_lshr_b32 s99, s28, 16 ; GFX11-NEXT: s_lshr_b32 s101, s28, 8 ; GFX11-NEXT: s_lshr_b32 s102, s41, 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 28 +; GFX11-NEXT: v_writelane_b32 v75, s42, 28 ; GFX11-NEXT: s_lshr_b32 s42, s4, 16 ; GFX11-NEXT: s_lshr_b32 s103, s41, 16 ; GFX11-NEXT: s_lshr_b32 s34, s41, 8 ; GFX11-NEXT: s_lshr_b32 s104, s40, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 27 +; GFX11-NEXT: v_writelane_b32 v75, s42, 27 ; GFX11-NEXT: s_lshr_b32 s42, s4, 8 ; GFX11-NEXT: s_lshr_b32 s35, s40, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b64 s[72:73], s[6:7], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 26 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[8:9], 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[6:7], 24 +; GFX11-NEXT: v_writelane_b32 v75, s42, 26 ; GFX11-NEXT: s_lshr_b32 s42, s3, 24 -; GFX11-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[2:3], 24 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 25 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[4:5], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[2:3], 24 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[0:1], 24 +; GFX11-NEXT: v_writelane_b32 v75, s42, 25 ; GFX11-NEXT: s_lshr_b32 s42, s3, 16 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 24 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[12:13], 24 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[14:15], 24 +; GFX11-NEXT: v_writelane_b32 v75, s42, 24 ; GFX11-NEXT: s_lshr_b32 s42, s3, 8 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[16:17], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[18:19], 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[16:17], 24 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[18:19], 24 ; GFX11-NEXT: s_lshr_b64 s[60:61], s[20:21], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 23 +; GFX11-NEXT: v_writelane_b32 v75, s42, 23 ; GFX11-NEXT: s_lshr_b32 s42, s2, 16 ; GFX11-NEXT: s_lshr_b64 s[58:59], s[22:23], 24 ; GFX11-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 ; GFX11-NEXT: s_lshr_b64 s[46:47], s[26:27], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 22 +; GFX11-NEXT: v_writelane_b32 v75, s42, 22 ; GFX11-NEXT: s_lshr_b32 s42, s2, 8 ; GFX11-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 21 +; GFX11-NEXT: v_writelane_b32 v75, s42, 21 ; GFX11-NEXT: s_lshr_b32 s42, s1, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 20 +; GFX11-NEXT: v_writelane_b32 v75, s42, 20 ; GFX11-NEXT: s_lshr_b32 s42, s1, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 19 +; GFX11-NEXT: v_writelane_b32 v75, s42, 19 ; GFX11-NEXT: s_lshr_b32 s42, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 18 +; GFX11-NEXT: v_writelane_b32 v75, s42, 18 ; GFX11-NEXT: s_lshr_b32 s42, s0, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 17 +; GFX11-NEXT: v_writelane_b32 v75, s42, 17 ; GFX11-NEXT: s_lshr_b32 s42, s0, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 16 +; GFX11-NEXT: v_writelane_b32 v75, s42, 16 ; GFX11-NEXT: s_lshr_b32 s42, s11, 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 15 +; GFX11-NEXT: v_writelane_b32 v75, s42, 15 ; GFX11-NEXT: s_lshr_b32 s42, s11, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 14 +; GFX11-NEXT: v_writelane_b32 v75, s42, 14 ; GFX11-NEXT: s_lshr_b32 s42, s11, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 13 +; GFX11-NEXT: v_writelane_b32 v75, s42, 13 ; GFX11-NEXT: s_lshr_b32 s42, s10, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 12 +; GFX11-NEXT: v_writelane_b32 v75, s42, 12 ; GFX11-NEXT: s_lshr_b32 s42, s10, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 11 +; GFX11-NEXT: v_writelane_b32 v75, s42, 11 ; GFX11-NEXT: s_lshr_b32 s42, s13, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 10 +; GFX11-NEXT: v_writelane_b32 v75, s42, 10 ; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 9 +; GFX11-NEXT: v_writelane_b32 v75, s42, 9 ; GFX11-NEXT: s_lshr_b32 s42, s13, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-NEXT: v_writelane_b32 v75, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s12, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-NEXT: v_writelane_b32 v75, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s12, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-NEXT: v_writelane_b32 v75, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s15, 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-NEXT: v_writelane_b32 v75, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s15, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-NEXT: v_writelane_b32 v75, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s15, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-NEXT: v_writelane_b32 v75, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s14, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-NEXT: v_writelane_b32 v75, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s14, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-NEXT: v_writelane_b32 v75, s42, 1 ; GFX11-NEXT: s_lshr_b32 s42, s17, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-NEXT: v_writelane_b32 v75, s42, 0 ; GFX11-NEXT: s_lshr_b64 s[42:43], s[40:41], 24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, vcc_hi ; GFX11-NEXT: s_cbranch_vccnz .LBB37_4 @@ -49046,79 +48813,79 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v55, 24, v4 ; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 24, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v97, 8, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 24, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 8, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v102, 8, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v103, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v113, 8, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v115, 8, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v117, 24, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v118, 8, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v128, 8, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v129, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v131, 8, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v133, 8, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v135, 24, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v134, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v149, 8, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v150, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v151, 8, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v160, 24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v161, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v163, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v164, 8, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v165, 24, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v166, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v167, 8, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v177, 8, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v178, 24, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v179, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v183, 24, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v40, 16, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v41, 8, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v42, 16, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v43, 8, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v44, 24, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v45, 16, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v57, 24, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v58, 16, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v59, 8, v49 -; GFX11-NEXT: v_lshrrev_b32_e32 v60, 16, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v61, 8, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v62, 24, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v63, 16, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v72, 8, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v73, 16, v52 -; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 24, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 24, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 24, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 24, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 24, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 24, v49 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 16, v49 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v49 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 16, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 8, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 24, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 16, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 8, v52 ; GFX11-NEXT: s_branch .LBB37_5 ; GFX11-NEXT: .LBB37_3: ; GFX11-NEXT: ; implicit-def: $sgpr43 @@ -49163,23 +48930,23 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: ; implicit-def: $sgpr53 ; GFX11-NEXT: ; implicit-def: $sgpr52 ; GFX11-NEXT: ; implicit-def: $sgpr50 +; GFX11-NEXT: ; implicit-def: $sgpr62 ; GFX11-NEXT: ; implicit-def: $sgpr51 ; GFX11-NEXT: ; implicit-def: $sgpr49 ; GFX11-NEXT: ; implicit-def: $sgpr48 ; GFX11-NEXT: ; implicit-def: $sgpr39 ; GFX11-NEXT: ; implicit-def: $sgpr37 +; GFX11-NEXT: ; implicit-def: $sgpr72 ; GFX11-NEXT: ; implicit-def: $sgpr38 ; GFX11-NEXT: ; implicit-def: $sgpr36 -; GFX11-NEXT: ; implicit-def: $sgpr30 -; GFX11-NEXT: ; implicit-def: $sgpr94 -; GFX11-NEXT: ; implicit-def: $sgpr92 -; GFX11-NEXT: ; implicit-def: $sgpr90 -; GFX11-NEXT: ; implicit-def: $sgpr88 -; GFX11-NEXT: ; implicit-def: $sgpr78 -; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: ; implicit-def: $sgpr74 -; GFX11-NEXT: ; implicit-def: $sgpr72 -; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: ; implicit-def: $sgpr76 +; GFX11-NEXT: ; implicit-def: $sgpr78 +; GFX11-NEXT: ; implicit-def: $sgpr88 +; GFX11-NEXT: ; implicit-def: $sgpr90 +; GFX11-NEXT: ; implicit-def: $sgpr92 +; GFX11-NEXT: ; implicit-def: $sgpr94 +; GFX11-NEXT: ; implicit-def: $sgpr30 ; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; kill: killed $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr43 @@ -49261,250 +49028,250 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: s_branch .LBB37_2 ; GFX11-NEXT: .LBB37_4: ; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: v_readlane_b32 s0, v77, 0 -; GFX11-NEXT: v_dual_mov_b32 v147, s36 :: v_dual_mov_b32 v52, s40 +; GFX11-NEXT: v_readlane_b32 s0, v75, 0 +; GFX11-NEXT: v_dual_mov_b32 v145, s36 :: v_dual_mov_b32 v52, s40 ; GFX11-NEXT: v_dual_mov_b32 v53, s41 :: v_dual_mov_b32 v48, s28 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v49, s29 :: v_dual_mov_b32 v148, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 1 +; GFX11-NEXT: v_dual_mov_b32 v49, s29 :: v_dual_mov_b32 v146, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 1 ; GFX11-NEXT: v_dual_mov_b32 v36, s26 :: v_dual_mov_b32 v37, s27 ; GFX11-NEXT: v_dual_mov_b32 v34, s24 :: v_dual_mov_b32 v35, s25 -; GFX11-NEXT: v_mov_b32_e32 v146, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-NEXT: v_mov_b32_e32 v144, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 2 ; GFX11-NEXT: v_dual_mov_b32 v30, s22 :: v_dual_mov_b32 v31, s23 ; GFX11-NEXT: v_dual_mov_b32 v28, s20 :: v_dual_mov_b32 v29, s21 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v145, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 3 +; GFX11-NEXT: v_mov_b32_e32 v135, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 3 ; GFX11-NEXT: v_dual_mov_b32 v23, s18 :: v_dual_mov_b32 v24, s19 ; GFX11-NEXT: v_dual_mov_b32 v21, s16 :: v_dual_mov_b32 v22, s17 -; GFX11-NEXT: v_mov_b32_e32 v144, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 4 +; GFX11-NEXT: v_mov_b32_e32 v134, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 4 ; GFX11-NEXT: v_dual_mov_b32 v17, s14 :: v_dual_mov_b32 v18, s15 ; GFX11-NEXT: v_dual_mov_b32 v13, s12 :: v_dual_mov_b32 v14, s13 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v134, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 5 +; GFX11-NEXT: v_mov_b32_e32 v132, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 5 ; GFX11-NEXT: v_dual_mov_b32 v11, s10 :: v_dual_mov_b32 v12, s11 ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_mov_b32_e32 v135, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-NEXT: v_mov_b32_e32 v133, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 6 ; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v6, s5 ; GFX11-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v133, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-NEXT: v_mov_b32_e32 v131, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 7 ; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9 -; GFX11-NEXT: v_dual_mov_b32 v74, s35 :: v_dual_mov_b32 v73, s104 -; GFX11-NEXT: v_mov_b32_e32 v132, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 8 -; GFX11-NEXT: v_dual_mov_b32 v72, s34 :: v_dual_mov_b32 v63, s103 -; GFX11-NEXT: v_dual_mov_b32 v62, s102 :: v_dual_mov_b32 v61, s101 +; GFX11-NEXT: v_dual_mov_b32 v72, s35 :: v_dual_mov_b32 v63, s104 +; GFX11-NEXT: v_mov_b32_e32 v130, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 8 +; GFX11-NEXT: v_dual_mov_b32 v62, s34 :: v_dual_mov_b32 v61, s103 +; GFX11-NEXT: v_dual_mov_b32 v60, s102 :: v_dual_mov_b32 v59, s101 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v131, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 9 -; GFX11-NEXT: v_dual_mov_b32 v60, s99 :: v_dual_mov_b32 v59, s100 -; GFX11-NEXT: v_dual_mov_b32 v58, s98 :: v_dual_mov_b32 v57, s97 ; GFX11-NEXT: v_mov_b32_e32 v129, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 10 -; GFX11-NEXT: v_dual_mov_b32 v56, s96 :: v_dual_mov_b32 v47, s86 -; GFX11-NEXT: v_dual_mov_b32 v46, s87 :: v_dual_mov_b32 v45, s85 +; GFX11-NEXT: v_readlane_b32 s0, v75, 9 +; GFX11-NEXT: v_dual_mov_b32 v58, s99 :: v_dual_mov_b32 v57, s100 +; GFX11-NEXT: v_dual_mov_b32 v56, s98 :: v_dual_mov_b32 v47, s97 +; GFX11-NEXT: v_mov_b32_e32 v119, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 10 +; GFX11-NEXT: v_dual_mov_b32 v46, s96 :: v_dual_mov_b32 v45, s86 +; GFX11-NEXT: v_dual_mov_b32 v44, s87 :: v_dual_mov_b32 v43, s85 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v130, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 11 -; GFX11-NEXT: v_dual_mov_b32 v44, s84 :: v_dual_mov_b32 v43, s83 -; GFX11-NEXT: v_dual_mov_b32 v42, s81 :: v_dual_mov_b32 v41, s82 ; GFX11-NEXT: v_mov_b32_e32 v128, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 12 -; GFX11-NEXT: v_dual_mov_b32 v40, s80 :: v_dual_mov_b32 v183, s71 -; GFX11-NEXT: v_dual_mov_b32 v182, s70 :: v_dual_mov_b32 v181, s68 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v119, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 13 -; GFX11-NEXT: v_dual_mov_b32 v180, s69 :: v_dual_mov_b32 v179, s67 -; GFX11-NEXT: v_dual_mov_b32 v178, s66 :: v_dual_mov_b32 v177, s65 +; GFX11-NEXT: v_readlane_b32 s0, v75, 11 +; GFX11-NEXT: v_dual_mov_b32 v42, s84 :: v_dual_mov_b32 v41, s83 +; GFX11-NEXT: v_dual_mov_b32 v40, s81 :: v_dual_mov_b32 v183, s82 ; GFX11-NEXT: v_mov_b32_e32 v118, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 14 -; GFX11-NEXT: v_dual_mov_b32 v176, s55 :: v_dual_mov_b32 v167, s64 -; GFX11-NEXT: v_dual_mov_b32 v166, s54 :: v_dual_mov_b32 v165, s53 +; GFX11-NEXT: v_readlane_b32 s0, v75, 12 +; GFX11-NEXT: v_dual_mov_b32 v182, s80 :: v_dual_mov_b32 v181, s71 +; GFX11-NEXT: v_dual_mov_b32 v180, s70 :: v_dual_mov_b32 v179, s68 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v116, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 15 -; GFX11-NEXT: v_dual_mov_b32 v164, s52 :: v_dual_mov_b32 v163, s50 -; GFX11-NEXT: v_dual_mov_b32 v162, s51 :: v_dual_mov_b32 v161, s49 ; GFX11-NEXT: v_mov_b32_e32 v117, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 16 -; GFX11-NEXT: v_dual_mov_b32 v160, s48 :: v_dual_mov_b32 v151, s39 -; GFX11-NEXT: v_dual_mov_b32 v150, s37 :: v_dual_mov_b32 v149, s38 +; GFX11-NEXT: v_readlane_b32 s0, v75, 13 +; GFX11-NEXT: v_dual_mov_b32 v178, s69 :: v_dual_mov_b32 v177, s67 +; GFX11-NEXT: v_dual_mov_b32 v176, s66 :: v_dual_mov_b32 v167, s65 +; GFX11-NEXT: v_mov_b32_e32 v116, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 14 +; GFX11-NEXT: v_dual_mov_b32 v166, s55 :: v_dual_mov_b32 v165, s64 +; GFX11-NEXT: v_dual_mov_b32 v164, s54 :: v_dual_mov_b32 v163, s53 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v115, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 17 -; GFX11-NEXT: v_dual_mov_b32 v15, s62 :: v_dual_mov_b32 v38, s88 -; GFX11-NEXT: v_dual_mov_b32 v19, s72 :: v_dual_mov_b32 v50, s90 ; GFX11-NEXT: v_mov_b32_e32 v114, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 18 -; GFX11-NEXT: v_dual_mov_b32 v25, s74 :: v_dual_mov_b32 v54, s92 -; GFX11-NEXT: v_dual_mov_b32 v64, s94 :: v_dual_mov_b32 v65, s30 +; GFX11-NEXT: v_readlane_b32 s0, v75, 15 +; GFX11-NEXT: v_dual_mov_b32 v162, s52 :: v_dual_mov_b32 v161, s50 +; GFX11-NEXT: v_dual_mov_b32 v160, s51 :: v_dual_mov_b32 v151, s49 +; GFX11-NEXT: v_mov_b32_e32 v115, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 16 +; GFX11-NEXT: v_dual_mov_b32 v150, s48 :: v_dual_mov_b32 v149, s39 +; GFX11-NEXT: v_dual_mov_b32 v148, s37 :: v_dual_mov_b32 v147, s38 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v113, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 19 -; GFX11-NEXT: v_dual_mov_b32 v66, s60 :: v_dual_mov_b32 v67, s58 -; GFX11-NEXT: v_dual_mov_b32 v68, s56 :: v_dual_mov_b32 v69, s46 -; GFX11-NEXT: v_mov_b32_e32 v103, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 20 -; GFX11-NEXT: v_mov_b32_e32 v70, s44 -; GFX11-NEXT: v_mov_b32_e32 v80, s42 -; GFX11-NEXT: v_mov_b32_e32 v26, s76 -; GFX11-NEXT: v_mov_b32_e32 v32, s78 +; GFX11-NEXT: v_readlane_b32 s0, v75, 17 +; GFX11-NEXT: v_dual_mov_b32 v15, s30 :: v_dual_mov_b32 v54, s74 +; GFX11-NEXT: v_dual_mov_b32 v19, s94 :: v_dual_mov_b32 v64, s72 ; GFX11-NEXT: v_mov_b32_e32 v112, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 21 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v102, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 22 +; GFX11-NEXT: v_readlane_b32 s0, v75, 18 +; GFX11-NEXT: v_dual_mov_b32 v25, s92 :: v_dual_mov_b32 v66, s60 +; GFX11-NEXT: v_dual_mov_b32 v65, s62 :: v_dual_mov_b32 v68, s56 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v103, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 19 +; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v70, s44 +; GFX11-NEXT: v_dual_mov_b32 v69, s46 :: v_dual_mov_b32 v80, s42 ; GFX11-NEXT: v_mov_b32_e32 v101, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 23 +; GFX11-NEXT: v_readlane_b32 s0, v75, 20 +; GFX11-NEXT: v_mov_b32_e32 v26, s90 +; GFX11-NEXT: v_mov_b32_e32 v32, s88 +; GFX11-NEXT: v_mov_b32_e32 v38, s78 +; GFX11-NEXT: v_mov_b32_e32 v50, s76 +; GFX11-NEXT: v_mov_b32_e32 v102, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 21 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v100, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 24 -; GFX11-NEXT: v_mov_b32_e32 v98, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v75, 22 ; GFX11-NEXT: v_mov_b32_e32 v99, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 26 -; GFX11-NEXT: v_mov_b32_e32 v97, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 27 +; GFX11-NEXT: v_readlane_b32 s0, v75, 23 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v98, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 24 ; GFX11-NEXT: v_mov_b32_e32 v96, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 28 +; GFX11-NEXT: v_readlane_b32 s0, v75, 25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v97, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 26 ; GFX11-NEXT: v_mov_b32_e32 v87, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 29 +; GFX11-NEXT: v_readlane_b32 s0, v75, 27 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v85, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 30 ; GFX11-NEXT: v_mov_b32_e32 v86, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 31 +; GFX11-NEXT: v_readlane_b32 s0, v75, 28 +; GFX11-NEXT: v_mov_b32_e32 v85, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 29 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v84, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 0 ; GFX11-NEXT: v_mov_b32_e32 v83, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 1 +; GFX11-NEXT: v_readlane_b32 s0, v75, 30 +; GFX11-NEXT: v_mov_b32_e32 v84, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 31 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v82, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 2 +; GFX11-NEXT: v_readlane_b32 s0, v76, 0 +; GFX11-NEXT: v_mov_b32_e32 v81, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v71, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 2 ; GFX11-NEXT: v_mov_b32_e32 v51, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 3 +; GFX11-NEXT: v_readlane_b32 s0, v76, 3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v55, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 4 +; GFX11-NEXT: v_readlane_b32 s0, v76, 4 ; GFX11-NEXT: v_mov_b32_e32 v39, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-NEXT: v_readlane_b32 s0, v76, 5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v33, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 6 +; GFX11-NEXT: v_readlane_b32 s0, v76, 6 ; GFX11-NEXT: v_mov_b32_e32 v27, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 7 +; GFX11-NEXT: v_readlane_b32 s0, v76, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v20, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 8 +; GFX11-NEXT: v_readlane_b32 s0, v76, 8 ; GFX11-NEXT: v_mov_b32_e32 v16, s0 ; GFX11-NEXT: .LBB37_5: ; %end -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v80 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v74 ; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v72, 8, v72 +; GFX11-NEXT: v_and_b32_e32 v63, 0xff, v63 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v80 ; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v62 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v70 -; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v80 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v73 -; GFX11-NEXT: v_and_b32_e32 v60, 0xff, v60 -; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v49 -; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v62, 8, v62 +; GFX11-NEXT: v_or_b32_e32 v52, v52, v72 +; GFX11-NEXT: v_and_b32_e32 v61, 0xff, v61 +; GFX11-NEXT: v_or_b32_e32 v80, v63, v80 +; GFX11-NEXT: v_lshlrev_b32_e32 v60, 8, v60 +; GFX11-NEXT: v_or_b32_e32 v53, v53, v62 ; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v52 -; GFX11-NEXT: v_or_b32_e32 v71, v80, v71 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v72 -; GFX11-NEXT: v_or_b32_e32 v70, v60, v70 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v69 -; GFX11-NEXT: v_and_b32_e32 v37, 0xff, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v71 -; GFX11-NEXT: v_or_b32_e32 v53, v53, v80 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v63 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v68 -; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-NEXT: v_or_b32_e32 v60, v52, v71 +; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v80 +; GFX11-NEXT: v_or_b32_e32 v60, v61, v60 +; GFX11-NEXT: v_lshlrev_b32_e32 v59, 8, v59 +; GFX11-NEXT: v_and_b32_e32 v61, 0xff, v58 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-NEXT: v_or_b32_e32 v58, v52, v80 ; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v53 -; GFX11-NEXT: v_or_b32_e32 v80, v80, v81 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v61 -; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v59 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v67 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v80 -; GFX11-NEXT: v_or_b32_e32 v48, v48, v81 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v58 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v57 -; GFX11-NEXT: v_or_b32_e32 v49, v49, v71 -; GFX11-NEXT: v_or_b32_e32 v61, v52, v53 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v60 +; GFX11-NEXT: v_or_b32_e32 v48, v48, v59 +; GFX11-NEXT: v_or_b32_e32 v70, v61, v70 +; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v57 +; GFX11-NEXT: v_and_b32_e32 v56, 0xff, v56 +; GFX11-NEXT: v_lshlrev_b32_e32 v47, 8, v47 +; GFX11-NEXT: v_or_b32_e32 v59, v52, v53 ; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v70 -; GFX11-NEXT: v_or_b32_e32 v53, v80, v81 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v56 -; GFX11-NEXT: v_and_b32_e32 v71, 0xff, v47 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v66 -; GFX11-NEXT: v_or_b32_e32 v62, v48, v52 +; GFX11-NEXT: v_or_b32_e32 v49, v49, v80 +; GFX11-NEXT: v_or_b32_e32 v53, v56, v47 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v46 +; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v45 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v69 +; GFX11-NEXT: v_or_b32_e32 v60, v48, v52 ; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v49 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v53 ; GFX11-NEXT: v_or_b32_e32 v36, v36, v70 -; GFX11-NEXT: v_or_b32_e32 v52, v71, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v46 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v45 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v44 -; GFX11-NEXT: v_or_b32_e32 v63, v48, v49 +; GFX11-NEXT: v_or_b32_e32 v52, v80, v69 +; GFX11-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v44 +; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v43 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v42 +; GFX11-NEXT: v_or_b32_e32 v61, v48, v49 ; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v52 ; GFX11-NEXT: v_or_b32_e32 v37, v37, v53 ; GFX11-NEXT: v_or_b32_e32 v49, v69, v70 ; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v34 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v43 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v42 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v41 +; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v40 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v68 ; GFX11-NEXT: v_or_b32_e32 v34, v36, v48 ; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v37 ; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v49 ; GFX11-NEXT: v_or_b32_e32 v48, v52, v53 ; GFX11-NEXT: v_or_b32_e32 v49, v69, v68 ; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v41 -; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v40 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v183 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v183 +; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v182 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v181 ; GFX11-NEXT: v_or_b32_e32 v35, v36, v37 ; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v48 ; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v49 ; GFX11-NEXT: v_or_b32_e32 v48, v52, v53 ; GFX11-NEXT: v_or_b32_e32 v49, v68, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v182 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v181 -; GFX11-NEXT: v_or_b32_e32 v36, v36, v37 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v180 +; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v179 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v67 ; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v49 ; GFX11-NEXT: v_or_b32_e32 v30, v30, v52 +; GFX11-NEXT: v_or_b32_e32 v36, v36, v37 ; GFX11-NEXT: v_or_b32_e32 v52, v53, v67 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v177 -; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v176 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 ; GFX11-NEXT: v_or_b32_e32 v37, v48, v49 -; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v179 +; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v177 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v167 ; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v52 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v178 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v176 +; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v166 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v66 ; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v180 -; GFX11-NEXT: v_or_b32_e32 v28, v28, v53 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v178 ; GFX11-NEXT: v_or_b32_e32 v49, v49, v52 +; GFX11-NEXT: v_or_b32_e32 v28, v28, v53 ; GFX11-NEXT: v_or_b32_e32 v52, v67, v66 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; GFX11-NEXT: v_or_b32_e32 v31, v31, v68 ; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v167 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v166 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v165 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v165 +; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v164 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v163 ; GFX11-NEXT: v_and_b32_e32 v66, 0xffff, v28 ; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52 ; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 @@ -49514,21 +49281,21 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_or_b32_e32 v28, v30, v48 ; GFX11-NEXT: v_or_b32_e32 v30, v66, v52 ; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v151 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v149 ; GFX11-NEXT: v_and_b32_e32 v67, 0xffff, v29 ; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v53 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off +; GFX11-NEXT: scratch_store_b128 v0, v[58:61], off ; GFX11-NEXT: scratch_store_b128 v0, v[34:37], off offset:16 ; GFX11-NEXT: v_or_b32_e32 v29, v31, v49 ; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v164 -; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v163 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v162 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v161 ; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v65 ; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v162 -; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v161 -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v160 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v160 +; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v151 +; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v150 ; GFX11-NEXT: v_or_b32_e32 v21, v21, v52 ; GFX11-NEXT: v_or_b32_e32 v31, v67, v53 ; GFX11-NEXT: v_or_b32_e32 v23, v23, v34 @@ -49536,16 +49303,16 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_or_b32_e32 v24, v24, v37 ; GFX11-NEXT: v_or_b32_e32 v35, v48, v49 ; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v21 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v150 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v148 ; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v64 ; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v149 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v147 ; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v146 -; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v145 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v144 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v135 ; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v54 -; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v147 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v148 +; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v145 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v146 ; GFX11-NEXT: v_or_b32_e32 v21, v21, v37 ; GFX11-NEXT: v_or_b32_e32 v22, v22, v48 ; GFX11-NEXT: v_or_b32_e32 v17, v17, v53 @@ -49565,32 +49332,32 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_or_b32_e32 v23, v36, v49 ; GFX11-NEXT: v_or_b32_e32 v34, v17, v48 ; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v144 -; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v134 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v135 -; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v132 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v134 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v132 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v133 +; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v130 ; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v50 ; GFX11-NEXT: v_or_b32_e32 v24, v52, v37 ; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v133 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v131 ; GFX11-NEXT: v_or_b32_e32 v17, v17, v18 ; GFX11-NEXT: v_or_b32_e32 v18, v35, v36 ; GFX11-NEXT: v_or_b32_e32 v35, v48, v49 ; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v131 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v129 ; GFX11-NEXT: v_or_b32_e32 v13, v13, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v130 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v128 ; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v35 -; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v129 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v119 ; GFX11-NEXT: v_or_b32_e32 v14, v14, v50 ; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v128 -; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v119 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v118 +; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v117 ; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v38 ; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v118 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v116 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v117 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v116 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v114 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v115 ; GFX11-NEXT: v_or_b32_e32 v35, v35, v37 ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 @@ -49609,32 +49376,32 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_or_b32_e32 v36, v13, v36 ; GFX11-NEXT: v_or_b32_e32 v37, v14, v48 ; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v115 -; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v113 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v112 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v32 ; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v113 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v103 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v102 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v100 ; GFX11-NEXT: v_or_b32_e32 v11, v11, v49 ; GFX11-NEXT: v_or_b32_e32 v12, v12, v38 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v103 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v112 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v101 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v102 ; GFX11-NEXT: v_or_b32_e32 v9, v9, v13 ; GFX11-NEXT: v_or_b32_e32 v13, v14, v17 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v18 ; GFX11-NEXT: v_or_b32_e32 v7, v7, v48 -; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v101 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v99 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v26 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v97 -; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v87 +; GFX11-NEXT: v_and_b32_e32 v49, 0xff, v86 ; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; GFX11-NEXT: v_or_b32_e32 v14, v32, v38 ; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v100 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v98 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v99 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v98 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v97 ; GFX11-NEXT: v_or_b32_e32 v17, v17, v18 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v48 ; GFX11-NEXT: v_or_b32_e32 v25, v49, v25 @@ -49655,16 +49422,16 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_or_b32_e32 v7, v7, v17 ; GFX11-NEXT: v_or_b32_e32 v9, v5, v25 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v87 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v85 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v86 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v83 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v83 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v84 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v81 ; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; GFX11-NEXT: v_or_b32_e32 v8, v8, v18 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v84 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v82 ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v82 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 8, v71 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-NEXT: v_or_b32_e32 v6, v10, v17 ; GFX11-NEXT: v_or_b32_e32 v10, v25, v19 @@ -49707,73 +49474,71 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v74, off, s32 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:72 -; GFX11-NEXT: v_readlane_b32 s104, v76, 8 -; GFX11-NEXT: v_readlane_b32 s103, v76, 7 -; GFX11-NEXT: v_readlane_b32 s102, v76, 6 -; GFX11-NEXT: v_readlane_b32 s101, v76, 5 -; GFX11-NEXT: v_readlane_b32 s100, v76, 4 -; GFX11-NEXT: v_readlane_b32 s99, v76, 3 -; GFX11-NEXT: v_readlane_b32 s98, v76, 2 -; GFX11-NEXT: v_readlane_b32 s97, v76, 1 -; GFX11-NEXT: v_readlane_b32 s96, v76, 0 -; GFX11-NEXT: v_readlane_b32 s87, v75, 31 -; GFX11-NEXT: v_readlane_b32 s86, v75, 30 -; GFX11-NEXT: v_readlane_b32 s85, v75, 29 -; GFX11-NEXT: v_readlane_b32 s84, v75, 28 -; GFX11-NEXT: v_readlane_b32 s83, v75, 27 -; GFX11-NEXT: v_readlane_b32 s82, v75, 26 -; GFX11-NEXT: v_readlane_b32 s81, v75, 25 -; GFX11-NEXT: v_readlane_b32 s80, v75, 24 -; GFX11-NEXT: v_readlane_b32 s71, v75, 23 -; GFX11-NEXT: v_readlane_b32 s70, v75, 22 -; GFX11-NEXT: v_readlane_b32 s69, v75, 21 -; GFX11-NEXT: v_readlane_b32 s68, v75, 20 -; GFX11-NEXT: v_readlane_b32 s67, v75, 19 -; GFX11-NEXT: v_readlane_b32 s66, v75, 18 -; GFX11-NEXT: v_readlane_b32 s65, v75, 17 -; GFX11-NEXT: v_readlane_b32 s64, v75, 16 -; GFX11-NEXT: v_readlane_b32 s55, v75, 15 -; GFX11-NEXT: v_readlane_b32 s54, v75, 14 -; GFX11-NEXT: v_readlane_b32 s53, v75, 13 -; GFX11-NEXT: v_readlane_b32 s52, v75, 12 -; GFX11-NEXT: v_readlane_b32 s51, v75, 11 -; GFX11-NEXT: v_readlane_b32 s50, v75, 10 -; GFX11-NEXT: v_readlane_b32 s49, v75, 9 -; GFX11-NEXT: v_readlane_b32 s48, v75, 8 -; GFX11-NEXT: v_readlane_b32 s39, v75, 7 -; GFX11-NEXT: v_readlane_b32 s38, v75, 6 -; GFX11-NEXT: v_readlane_b32 s37, v75, 5 -; GFX11-NEXT: v_readlane_b32 s36, v75, 4 -; GFX11-NEXT: v_readlane_b32 s35, v75, 3 -; GFX11-NEXT: v_readlane_b32 s34, v75, 2 -; GFX11-NEXT: v_readlane_b32 s31, v75, 1 -; GFX11-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-NEXT: s_clause 0x10 ; 68-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v72, off, s32 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64 +; GFX11-NEXT: v_readlane_b32 s104, v74, 8 +; GFX11-NEXT: v_readlane_b32 s103, v74, 7 +; GFX11-NEXT: v_readlane_b32 s102, v74, 6 +; GFX11-NEXT: v_readlane_b32 s101, v74, 5 +; GFX11-NEXT: v_readlane_b32 s100, v74, 4 +; GFX11-NEXT: v_readlane_b32 s99, v74, 3 +; GFX11-NEXT: v_readlane_b32 s98, v74, 2 +; GFX11-NEXT: v_readlane_b32 s97, v74, 1 +; GFX11-NEXT: v_readlane_b32 s96, v74, 0 +; GFX11-NEXT: v_readlane_b32 s87, v73, 31 +; GFX11-NEXT: v_readlane_b32 s86, v73, 30 +; GFX11-NEXT: v_readlane_b32 s85, v73, 29 +; GFX11-NEXT: v_readlane_b32 s84, v73, 28 +; GFX11-NEXT: v_readlane_b32 s83, v73, 27 +; GFX11-NEXT: v_readlane_b32 s82, v73, 26 +; GFX11-NEXT: v_readlane_b32 s81, v73, 25 +; GFX11-NEXT: v_readlane_b32 s80, v73, 24 +; GFX11-NEXT: v_readlane_b32 s71, v73, 23 +; GFX11-NEXT: v_readlane_b32 s70, v73, 22 +; GFX11-NEXT: v_readlane_b32 s69, v73, 21 +; GFX11-NEXT: v_readlane_b32 s68, v73, 20 +; GFX11-NEXT: v_readlane_b32 s67, v73, 19 +; GFX11-NEXT: v_readlane_b32 s66, v73, 18 +; GFX11-NEXT: v_readlane_b32 s65, v73, 17 +; GFX11-NEXT: v_readlane_b32 s64, v73, 16 +; GFX11-NEXT: v_readlane_b32 s55, v73, 15 +; GFX11-NEXT: v_readlane_b32 s54, v73, 14 +; GFX11-NEXT: v_readlane_b32 s53, v73, 13 +; GFX11-NEXT: v_readlane_b32 s52, v73, 12 +; GFX11-NEXT: v_readlane_b32 s51, v73, 11 +; GFX11-NEXT: v_readlane_b32 s50, v73, 10 +; GFX11-NEXT: v_readlane_b32 s49, v73, 9 +; GFX11-NEXT: v_readlane_b32 s48, v73, 8 +; GFX11-NEXT: v_readlane_b32 s39, v73, 7 +; GFX11-NEXT: v_readlane_b32 s38, v73, 6 +; GFX11-NEXT: v_readlane_b32 s37, v73, 5 +; GFX11-NEXT: v_readlane_b32 s36, v73, 4 +; GFX11-NEXT: v_readlane_b32 s35, v73, 3 +; GFX11-NEXT: v_readlane_b32 s34, v73, 2 +; GFX11-NEXT: v_readlane_b32 s31, v73, 1 +; GFX11-NEXT: v_readlane_b32 s30, v73, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:72 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -49814,22 +49579,22 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 @@ -49848,33 +49613,33 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:152 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:184 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -49931,19 +49696,19 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 @@ -49951,19 +49716,19 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -49974,7 +49739,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 @@ -49982,27 +49747,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 @@ -50011,15 +49776,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 @@ -50033,24 +49798,24 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 @@ -50059,29 +49824,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:268 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 @@ -50090,29 +49852,29 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 @@ -50121,29 +49883,29 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 @@ -50152,239 +49914,307 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: v_or_b32_e32 v3, v40, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v5, v41, v5 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v36 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v43 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v63 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v6, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v34 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v38 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v8, v8, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v8, v8, v55 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v46 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v51 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v63 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v51 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v45 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -50402,204 +50232,299 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v59 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 ; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v43 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 ; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -50611,351 +50536,189 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: .LBB38_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v45, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v5, v41, v5 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v4, v43, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_or_b32_e32 v5, v63, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v56, v6 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v46, v7 +; SI-NEXT: v_or_b32_e32 v7, v38, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v8, v44, v8 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 @@ -50963,15 +50726,14 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 @@ -50979,12 +50741,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 @@ -50992,12 +50754,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 @@ -51005,12 +50767,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -51018,12 +50780,14 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 @@ -51055,7 +50819,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51064,7 +50828,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -51072,7 +50836,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51081,7 +50845,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -51089,7 +50853,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51098,7 +50862,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -51106,7 +50870,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51115,7 +50879,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v19 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -51123,7 +50887,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51132,7 +50896,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -51140,7 +50904,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51149,7 +50913,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -51157,7 +50921,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51166,15 +50930,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51183,32 +50947,30 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51217,15 +50979,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51234,15 +50996,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51251,15 +51013,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51268,15 +51030,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51285,15 +51047,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51302,19 +51064,21 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_or_b32_e32 v31, v43, v31 ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 @@ -51391,16 +51155,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v9 @@ -51436,43 +51200,42 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill @@ -51482,13 +51245,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 @@ -51498,11 +51261,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 @@ -51511,7 +51274,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -51521,7 +51284,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill @@ -51535,19 +51298,19 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill @@ -51561,25 +51324,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -51587,25 +51350,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -51613,25 +51376,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -51639,15 +51402,15 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 @@ -51656,88 +51419,88 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v14, v44, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: v_or_b32_sdwa v15, v41, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -51761,84 +51524,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -51852,41 +51547,41 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -51904,11 +51599,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -51921,17 +51616,17 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -51940,289 +51635,359 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: .LBB38_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB38_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v31, 0x300 +; VI-NEXT: v_add_u16_e32 v9, 3, v62 ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 -; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v5, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -52254,11 +52019,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v62 +; VI-NEXT: v_add_u16_e32 v8, 3, v63 ; VI-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v32 +; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v10, 3, v60 ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -52267,9 +52032,8 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v10, v10, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v9, v9, v10 ; VI-NEXT: v_add_u16_e32 v10, 3, v58 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v11, 3, v56 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -52291,7 +52055,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 3, v46 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 ; VI-NEXT: v_add_u16_sdwa v13, v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v12, v12, v13 @@ -52299,39 +52063,38 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 3, v44 -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_add_u16_sdwa v14, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v42 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v14, 3, v43 ; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v15, 3, v40 +; VI-NEXT: v_add_u16_e32 v15, 3, v41 ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_sdwa v15, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v14, v14, v15 ; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v15, 3, v15 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 @@ -52339,12 +52102,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v16, v17 ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v18, 3, v18 @@ -52377,7 +52140,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 @@ -52396,14 +52159,14 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v21, 0x300, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v22, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 @@ -52430,46 +52193,46 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v24, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v23, v24 ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v26, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -52481,21 +52244,21 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v28, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v27, v27, v28 -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v28, 3, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v29, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v28, v28, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -52508,7 +52271,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v29, v30 ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v30, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -52520,7 +52283,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v32, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v30, v30, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v32, 3, v32 @@ -52530,7 +52293,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v33, 3, v33 -; VI-NEXT: v_or_b32_sdwa v33, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: .LBB38_4: ; %end @@ -52607,16 +52370,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v9 @@ -52662,47 +52425,45 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v32 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill @@ -52712,13 +52473,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 @@ -52729,11 +52490,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 @@ -52742,7 +52503,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill @@ -52754,7 +52515,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill @@ -52769,19 +52530,19 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill @@ -52796,25 +52557,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -52823,25 +52584,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -52850,25 +52611,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -52877,106 +52638,105 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB38_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_or_b32_sdwa v15, v41, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -53000,84 +52760,16 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -53091,41 +52783,41 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -53143,11 +52835,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -53160,17 +52852,17 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -53179,296 +52871,363 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: .LBB38_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB38_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v62 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(27) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -53500,11 +53259,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 ; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v32 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -53513,7 +53272,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 @@ -53536,7 +53295,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 @@ -53544,39 +53303,38 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v14, 3, v43 ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v41 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 @@ -53584,12 +53342,12 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 @@ -53622,7 +53380,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 @@ -53641,14 +53399,14 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 @@ -53675,46 +53433,46 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v25 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -53726,21 +53484,21 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -53753,7 +53511,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -53765,7 +53523,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 @@ -53775,7 +53533,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v32, v63, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 ; GFX9-NEXT: .LBB38_4: ; %end @@ -55592,254 +55350,244 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v25 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 -; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v44 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v43 -; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v42 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v54 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v0 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -55847,920 +55595,933 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB39_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v59, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v2, v2, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v61 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v47, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v0, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 -; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v43 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mov_b32_e32 v50, v16 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v16, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: v_mov_b32_e32 v48, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v17, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v18, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 -; SI-NEXT: v_mov_b32_e32 v54, v23 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v20, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v3 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v21, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v22, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v45, v24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v34, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 -; SI-NEXT: v_or_b32_e32 v23, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v61 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v24, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v25, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 -; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v26, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v37, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v52, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_or_b32_e32 v27, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_mov_b32_e32 v34, v41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v28, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v29, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v30, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v31, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s20, 0xff -; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s24, 0xff -; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_branch .LBB39_3 ; SI-NEXT: .LBB39_2: -; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v43 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v34, v41 +; SI-NEXT: v_mov_b32_e32 v33, v60 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB39_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mov_b32_e32 v35, v57 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) expcnt(3) +; SI-NEXT: v_mov_b32_e32 v59, v62 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB39_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_and_b32 s8, s26, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, s7, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v6, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v7, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v8, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v9, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v10, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v10 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v11, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v11 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v12, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v12 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v13, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v13 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v14, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v50, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v16, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v59, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v17, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v18, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v18 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v19, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v19 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v57, v4 +; SI-NEXT: v_or_b32_e32 v20, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v21, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v46, v1 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -56768,14 +56529,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -56783,20 +56544,19 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v0 @@ -56842,115 +56602,114 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v23 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 @@ -56961,26 +56720,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v11 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v7 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -56990,807 +56745,814 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:116 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB39_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff ; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s8, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v46, v0 -; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v62, v0 -; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v63, v1 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v1 +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_mov_b32_e32 v60, v0 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v35, v0 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v44, v0 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v34, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v59, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v63, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v51 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v61 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v55, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v37, v53 ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v54, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v41, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v33 +; VI-NEXT: v_mov_b32_e32 v38, v54 +; VI-NEXT: v_or_b32_sdwa v0, v54, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v63 +; VI-NEXT: v_mov_b32_e32 v46, v33 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v51, v34 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v61, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v56 -; VI-NEXT: v_or_b32_sdwa v0, v56, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v38, v39 -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v53 -; VI-NEXT: v_mov_b32_e32 v52, v36 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v0, v36, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v60 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v1, v33, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v34, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v62, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v59, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v50, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v49, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v51, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v56, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v48, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v39, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v50, v40 -; VI-NEXT: v_mov_b32_e32 v49, v51 -; VI-NEXT: v_mov_b32_e32 v40, v34 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v36, v62 +; VI-NEXT: v_mov_b32_e32 v59, v58 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v62, v32 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_branch .LBB39_3 ; VI-NEXT: .LBB39_2: -; VI-NEXT: v_mov_b32_e32 v44, v56 -; VI-NEXT: v_mov_b32_e32 v41, v33 -; VI-NEXT: v_mov_b32_e32 v50, v40 -; VI-NEXT: v_mov_b32_e32 v38, v39 -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v54, v53 -; VI-NEXT: v_mov_b32_e32 v52, v36 -; VI-NEXT: v_mov_b32_e32 v49, v51 +; VI-NEXT: v_mov_b32_e32 v48, v63 +; VI-NEXT: v_mov_b32_e32 v37, v53 +; VI-NEXT: v_mov_b32_e32 v35, v51 +; VI-NEXT: v_mov_b32_e32 v38, v54 +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v36, v62 +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v46, v33 +; VI-NEXT: v_mov_b32_e32 v50, v32 +; VI-NEXT: v_mov_b32_e32 v59, v58 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB39_3: ; %Flow -; VI-NEXT: v_mov_b32_e32 v51, v41 -; VI-NEXT: v_mov_b32_e32 v36, v44 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v54, v60 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v52, v59 +; VI-NEXT: v_mov_b32_e32 v58, v36 +; VI-NEXT: v_mov_b32_e32 v59, v38 ; VI-NEXT: s_cbranch_vccnz .LBB39_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_mov_b32_e32 v33, v35 +; VI-NEXT: v_mov_b32_e32 v35, v37 +; VI-NEXT: v_mov_b32_e32 v37, v48 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, s7, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v16 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v17 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v41, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v19, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v2 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v21, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v21 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v46 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v59 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v61 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v58 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v56 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v47 +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v45 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -57835,128 +57597,130 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:80 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:144 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:176 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v41 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v55 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v45 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v20 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v28 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v30 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 @@ -57966,7 +57730,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: v_lshlrev_b32_e32 v41, 8, v41 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 @@ -57978,16 +57741,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v7 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -57998,423 +57760,410 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v13 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v5 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 -; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:252 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB39_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s8, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v59, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_mov_b32_e32 v61, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_mov_b32_e32 v37, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v47, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v61, v52 +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v35, v62 -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v40, v30 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v36, v31 -; GFX9-NEXT: v_mov_b32_e32 v45, v62 -; GFX9-NEXT: v_mov_b32_e32 v46, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v53 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v46, v42 +; GFX9-NEXT: v_mov_b32_e32 v53, v51 +; GFX9-NEXT: v_mov_b32_e32 v50, v33 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 +; GFX9-NEXT: v_mov_b32_e32 v42, v36 +; GFX9-NEXT: v_mov_b32_e32 v51, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 -; GFX9-NEXT: s_and_b32 s4, s16, 0xff -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s18, 0xff -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_branch .LBB39_3 ; GFX9-NEXT: .LBB39_2: -; GFX9-NEXT: v_mov_b32_e32 v33, v43 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v62 -; GFX9-NEXT: v_mov_b32_e32 v36, v31 -; GFX9-NEXT: v_mov_b32_e32 v40, v30 +; GFX9-NEXT: v_mov_b32_e32 v61, v52 +; GFX9-NEXT: v_mov_b32_e32 v46, v42 +; GFX9-NEXT: v_mov_b32_e32 v53, v51 +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v50, v33 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB39_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v62, v35 -; GFX9-NEXT: v_mov_b32_e32 v35, v38 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB39_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -58458,160 +58207,163 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_and_b32 s8, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s29, 8 ; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 ; GFX9-NEXT: s_movk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v53 +; GFX9-NEXT: v_mov_b32_e32 v54, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v50 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 @@ -58619,40 +58371,41 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 @@ -58660,153 +58413,159 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 3, v53 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v46 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v62 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v58 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v57 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v56 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -59052,309 +58811,241 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB39_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v91 -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v49 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 -; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v78 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v79 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v35 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v88 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v63 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v72 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v62 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v73 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v74 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v75 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v47 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v45 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v58 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v59 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v182 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v56 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v40 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v41 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v165 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v43 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v44 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v163 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v167 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v16, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v148 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v177 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v146 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v18, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v147 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v118 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v150 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v103 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v151 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v160 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v20, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v101 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v132 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v112 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v132 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v133 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v84 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v102 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v144 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v119 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v128 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v25, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v130 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v24, v131 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v84 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v130 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v131 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v114 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v116 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v27, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v94, 0xff, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v81 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v116 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v69 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v2, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v4, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v94, v89 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v99 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v30, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v2, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB39_3 @@ -59993,309 +59684,241 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB39_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91 -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v49 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 -; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v6, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v35 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v8, v63 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v33 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v62 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v10, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v47 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v45 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v12, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v182 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v13, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v180 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v14, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v165 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v15, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v163 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v16, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v148 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v177 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v146 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v18, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v118 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v103 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v20, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v101 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v21, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v86 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v22, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v23, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v81 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v24, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v25, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v24, v131 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v94, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v2, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v4, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v94, v89 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v99 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v30, v31 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v2, v3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB39_3 @@ -61663,22 +61286,22 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_readfirstlane_b32 s14, v20 ; SI-NEXT: v_mov_b32_e32 v20, s21 ; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: v_readfirstlane_b32 s8, v20 +; SI-NEXT: v_readfirstlane_b32 s16, v20 ; SI-NEXT: v_mov_b32_e32 v20, s22 ; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: v_readfirstlane_b32 s9, v20 +; SI-NEXT: v_readfirstlane_b32 s8, v20 ; SI-NEXT: v_mov_b32_e32 v20, s23 ; SI-NEXT: v_writelane_b32 v63, s85, 29 -; SI-NEXT: v_readfirstlane_b32 s11, v20 +; SI-NEXT: v_readfirstlane_b32 s9, v20 ; SI-NEXT: v_mov_b32_e32 v20, s24 ; SI-NEXT: v_writelane_b32 v63, s86, 30 -; SI-NEXT: v_readfirstlane_b32 s13, v20 +; SI-NEXT: v_readfirstlane_b32 s11, v20 ; SI-NEXT: v_mov_b32_e32 v20, s25 ; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_readfirstlane_b32 s15, v20 +; SI-NEXT: v_readfirstlane_b32 s13, v20 ; SI-NEXT: v_mov_b32_e32 v20, s26 ; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: v_readfirstlane_b32 s16, v20 +; SI-NEXT: v_readfirstlane_b32 s15, v20 ; SI-NEXT: v_mov_b32_e32 v20, s27 ; SI-NEXT: v_writelane_b32 v63, s97, 33 ; SI-NEXT: v_readfirstlane_b32 s17, v20 @@ -61771,18 +61394,18 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: s_lshl_b32 s55, s18, 16 ; SI-NEXT: s_and_b32 s64, s17, 0xffff0000 ; SI-NEXT: s_lshl_b32 s65, s17, 16 -; SI-NEXT: s_and_b32 s66, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s67, s16, 16 -; SI-NEXT: s_and_b32 s68, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s69, s15, 16 -; SI-NEXT: s_and_b32 s70, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s71, s13, 16 -; SI-NEXT: s_and_b32 s80, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s81, s11, 16 -; SI-NEXT: s_and_b32 s82, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s83, s9, 16 -; SI-NEXT: s_and_b32 s84, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s85, s8, 16 +; SI-NEXT: s_and_b32 s66, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s67, s15, 16 +; SI-NEXT: s_and_b32 s68, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s13, 16 +; SI-NEXT: s_and_b32 s70, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s71, s11, 16 +; SI-NEXT: s_and_b32 s80, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s9, 16 +; SI-NEXT: s_and_b32 s82, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s83, s8, 16 +; SI-NEXT: s_and_b32 s84, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s16, 16 ; SI-NEXT: s_and_b32 s86, s14, 0xffff0000 ; SI-NEXT: s_lshl_b32 s87, s14, 16 ; SI-NEXT: s_and_b32 s96, s12, 0xffff0000 @@ -61806,12 +61429,12 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 ; SI-NEXT: v_add_f32_e64 v2, s7, 1.0 ; SI-NEXT: v_add_f32_e64 v3, s14, 1.0 -; SI-NEXT: v_add_f32_e64 v45, s8, 1.0 -; SI-NEXT: v_add_f32_e64 v43, s9, 1.0 -; SI-NEXT: v_add_f32_e64 v41, s11, 1.0 -; SI-NEXT: v_add_f32_e64 v55, s13, 1.0 -; SI-NEXT: v_add_f32_e64 v53, s15, 1.0 -; SI-NEXT: v_add_f32_e64 v51, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v45, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v43, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v41, s9, 1.0 +; SI-NEXT: v_add_f32_e64 v55, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v53, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v51, s15, 1.0 ; SI-NEXT: v_add_f32_e64 v49, s17, 1.0 ; SI-NEXT: v_add_f32_e64 v39, s18, 1.0 ; SI-NEXT: v_add_f32_e64 v37, s19, 1.0 @@ -62985,18 +62608,18 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -65548,650 +65171,655 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: v_mov_b32_e32 v43, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v58, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v28, v27 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v41, v23 -; SI-NEXT: v_mov_b32_e32 v29, v20 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v44, v20 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v59, v29 +; SI-NEXT: v_mov_b32_e32 v60, v24 +; SI-NEXT: v_mov_b32_e32 v41, v22 +; SI-NEXT: v_mov_b32_e32 v22, v13 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s28 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v37 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v57 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 ; SI-NEXT: s_cbranch_scc0 .LBB43_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_mov_b32_e32 v47, v3 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v23 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v1, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 -; SI-NEXT: v_mov_b32_e32 v2, v14 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v3, v16 -; SI-NEXT: v_mov_b32_e32 v20, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v63 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v4, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v5, v29 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[4:5], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v5, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 -; SI-NEXT: v_mov_b32_e32 v6, v45 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[5:6], v[9:10], 16 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v57 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[6:7], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v7, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 -; SI-NEXT: v_mov_b32_e32 v8, v9 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[7:8], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v8, v24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_lshr_b64 v[8:9], v[24:25], 16 ; SI-NEXT: v_mov_b32_e32 v9, v54 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_mov_b32_e32 v10, v11 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: v_mov_b32_e32 v10, v53 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 -; SI-NEXT: v_mov_b32_e32 v13, v58 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[53:54], 16 +; SI-NEXT: v_mov_b32_e32 v11, v52 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 -; SI-NEXT: v_mov_b32_e32 v14, v60 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[11:12], v[52:53], 16 +; SI-NEXT: v_mov_b32_e32 v12, v51 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 -; SI-NEXT: v_mov_b32_e32 v15, v62 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[12:13], v[51:52], 16 +; SI-NEXT: v_mov_b32_e32 v13, v39 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 -; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v14, v32 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_mov_b32_e32 v40, v17 -; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_mov_b32_e32 v21, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 -; SI-NEXT: v_mov_b32_e32 v22, v31 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 -; SI-NEXT: v_mov_b32_e32 v23, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 -; SI-NEXT: v_mov_b32_e32 v24, v41 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[14:15], v[32:33], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 -; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_lshr_b64 v[16:17], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v17, v62 +; SI-NEXT: v_mov_b32_e32 v19, v63 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v18, v44 +; SI-NEXT: v_mov_b32_e32 v63, v19 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[18:19], v[44:45], 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_mov_b32_e32 v19, v61 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[61:62], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v56 +; SI-NEXT: v_mov_b32_e32 v56, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; SI-NEXT: v_mov_b32_e32 v20, v60 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[60:61], 16 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v42 +; SI-NEXT: v_mov_b32_e32 v42, v59 +; SI-NEXT: v_lshr_b64 v[21:22], v[59:60], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v47 +; SI-NEXT: v_lshr_b64 v[22:23], v[58:59], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v58, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v43 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_mov_b32_e32 v23, v31 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v32, v35 ; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 -; SI-NEXT: v_mov_b32_e32 v26, v43 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_lshr_b64 v[26:27], v[34:35], 16 +; SI-NEXT: v_mov_b32_e32 v27, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 ; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v57 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshr_b64 v[28:29], v[50:51], 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_mov_b32_e32 v29, v30 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 -; SI-NEXT: v_mov_b32_e32 v53, v31 -; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[30:31], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v49 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshr_b64 v[34:35], v[49:50], 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[38:39], 16 +; SI-NEXT: v_mov_b32_e32 v31, v34 ; SI-NEXT: s_branch .LBB43_3 ; SI-NEXT: .LBB43_2: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: v_mov_b32_e32 v42, v59 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v50 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v60, v3 +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB43_3: ; %Flow -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mov_b32_e32 v32, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v33, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v53 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v47 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v54, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v36, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v61 ; SI-NEXT: s_cbranch_vccnz .LBB43_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 @@ -66200,83 +65828,72 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 ; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB43_5: ; %end @@ -66306,36 +65923,36 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 ; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 ; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 ; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill @@ -67685,8 +67302,8 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, v7 :: v_dual_mov_b32 v182, v6 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v183, v5 :: v_dual_mov_b32 v168, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v169, v3 :: v_dual_mov_b32 v170, v2 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, v0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, s28 :: v_dual_mov_b32 v173, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v173, v0 :: v_dual_mov_b32 v174, s29 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 @@ -67709,769 +67326,655 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s25, 16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v7, v2 :: v_dual_add_nc_u32 v7, v8, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v9, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v1.l -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v1.l +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 ; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 ; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 -; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v167 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v177 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v167 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v167 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v167 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v176 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v176 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v176 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v177 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v178 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v179 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v6 :: v_dual_add_f32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v177 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v177 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v178 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v178 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v181 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v4, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v179 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v179 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v180 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v180 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v183 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v3, v5, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v182 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v3, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v181 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v181 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v182 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v182 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v169 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v5, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v168 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v1.l ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v183 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v183 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v168 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v168 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v168 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v171 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v3, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v170 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v6 :: v_dual_add_f32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v168.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v169 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v169 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v170 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v170 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v171 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v171 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v172 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v170 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v169 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v172 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v6 :: v_dual_add_nc_u32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v5, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v171 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v173 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v173 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v173 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v4, v6 :: v_dual_add_nc_u32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v6 :: v_dual_lshlrev_b32 v6, 16, v173 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v174 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_add_f32 v5, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v174 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v9, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v172 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v172 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_add_f32 v4, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v173, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v173.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v174 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v174 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v7 :: v_dual_add_nc_u32 v7, v8, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v1.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v3.l ; GFX11-TRUE16-NEXT: .LBB43_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v125 :: v_dual_mov_b32 v5, v120 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v114 :: v_dual_mov_b32 v7, v107 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v99 :: v_dual_mov_b32 v9, v90 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v57 :: v_dual_mov_b32 v13, v44 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v17, v173 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v174 :: v_dual_mov_b32 v19, v171 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v19, v171 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v172 :: v_dual_mov_b32 v17, v174 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v23, v183 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 ; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 @@ -68580,101 +68083,174 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:184 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:56 +; GFX11-FAKE16-NEXT: s_clause 0xd ; 56-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v185, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v188, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v189, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v190, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v191, s32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v190, v13 :: v_dual_mov_b32 v191, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v67, v9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v179, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, v7 :: v_dual_mov_b32 v183, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v189, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v4 :: v_dual_mov_b32 v185, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v1 :: v_dual_mov_b32 v69, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v0 :: v_dual_mov_b32 v181, s29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v107, s16 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v34, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v140, s2 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:1080 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:1096 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:1112 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1128 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1144 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1160 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1176 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1192 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v144, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v114, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s17 :: v_dual_mov_b32 v159, s26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v42, s19 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:952 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:968 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:984 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1000 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1016 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1032 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1048 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1064 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:824 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:840 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:856 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:872 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:888 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:904 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:920 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:936 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s23 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:696 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:712 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:728 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:744 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:760 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:776 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:792 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:808 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, s24 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:632 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:648 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:664 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:680 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s25 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:552 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s27 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:424 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-FAKE16-NEXT: .LBB43_2: ; %cmp.true @@ -68682,762 +68258,937 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s24, 16 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v9 :: v_dual_add_nc_u32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:424 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s23, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v159, v0, 16, v1 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s22, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:552 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s21, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:632 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:648 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:664 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:680 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s20, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:696 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:712 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:728 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:744 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:760 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:776 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:792 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:808 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s19, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:824 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:840 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:856 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:872 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:888 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:904 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:920 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:936 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s18, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:952 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:968 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:984 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1000 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1016 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1032 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1048 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1064 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:1080 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:1096 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:1112 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1128 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1144 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1160 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1176 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1192 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v69 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v42, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v114, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 ; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v107, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 ; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v144, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v140, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v4, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v190 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v190 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v1, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_add_nc_u32 v3, v5, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v191 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v7, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v191 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v190, v1, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v3, v5, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v68 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v191, v2, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v7, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v67 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v5, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v179 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v7, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v179 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v68, v1, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v9, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v8 :: v_dual_and_b32 v6, 0xffff0000, v70 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v2, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v188 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v188 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v6, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v7, v11, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_add_f32 v5, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v183 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v3, v3, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v70, v1, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v180 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v189 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v188, v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v7, 16, v180 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_add_nc_u32 v6, v7, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v189 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_add_nc_u32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v12, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v185 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_add_nc_u32 v5, v11, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v185 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v10 :: v_dual_cndmask_b32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v10, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v184 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v189, v5, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v182 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v6, v11 :: v_dual_lshlrev_b32 v10, 16, v69 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v181 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v13 :: v_dual_add_nc_u32 v11, v12, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v14, v12 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v16 :: v_dual_add_nc_u32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v13, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v185, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v3, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v5, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v14, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v7, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v69, v9, 16, v10 ; GFX11-FAKE16-NEXT: .LBB43_3: ; %end -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v34 :: v_dual_mov_b32 v2, v140 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v42 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:1080 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:1096 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:1112 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:1128 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:1144 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:1160 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:1176 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:1192 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v144 :: v_dual_mov_b32 v4, v107 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v76 :: v_dual_mov_b32 v6, v114 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v184 :: v_dual_mov_b32 v20, v185 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v180 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v183 :: v_dual_mov_b32 v24, v188 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, v32 :: v_dual_mov_b32 v30, v191 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v190 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, v181 :: v_dual_mov_b32 v18, v182 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v70 :: v_dual_mov_b32 v26, v179 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v67 :: v_dual_mov_b32 v28, v68 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v14, v159 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v69 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v41 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:952 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:968 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:984 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:1000 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:1016 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:1032 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:1048 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:1064 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v42 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:824 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:840 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:856 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:872 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:888 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:904 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:920 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:936 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v43 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:696 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:712 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:728 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:744 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:760 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:776 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:792 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:808 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, v44 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:632 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:648 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:664 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:680 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v45 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:552 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v46 +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 224-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v191, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v188, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v185, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:92 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:220 +; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:308 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, v48 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB43_4: -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 ; GFX11-FAKE16-NEXT: s_branch .LBB43_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -69478,15 +69229,15 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 @@ -69499,213 +69250,199 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v4 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -69717,28 +69454,27 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v62 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -69755,89 +69491,99 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: .LBB44_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_f32_e32 v44, 1.0, v63 -; SI-NEXT: v_add_f32_e32 v46, 1.0, v62 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v46 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v44, 1.0, v62 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 ; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v19 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v33, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -69845,37 +69591,42 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 @@ -69898,10 +69649,11 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 @@ -69910,37 +69662,37 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 -; SI-NEXT: v_mov_b32_e32 v50, v28 ; SI-NEXT: v_mov_b32_e32 v48, v29 ; SI-NEXT: v_mov_b32_e32 v38, v30 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v56, v8 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v31 +; SI-NEXT: v_mov_b32_e32 v46, v28 +; SI-NEXT: v_mov_b32_e32 v63, v8 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 @@ -69951,27 +69703,25 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 @@ -69980,7 +69730,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 @@ -69989,7 +69739,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -69998,7 +69748,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -70007,7 +69757,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -70016,7 +69766,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -70025,7 +69775,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -70034,7 +69784,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -70043,7 +69793,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -70052,7 +69802,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -70061,7 +69811,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -70071,7 +69821,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -70082,8 +69832,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -70093,8 +69843,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -70104,8 +69854,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -70115,8 +69865,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -70126,8 +69876,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -70137,8 +69887,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -70148,8 +69898,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -70159,7 +69909,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -70170,8 +69920,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -70180,53 +69930,55 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -70443,148 +70195,148 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v55 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v39 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v15 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v39 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v53 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v55 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v54 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v55 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v42 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v43 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v42 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 @@ -70592,7 +70344,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v19 @@ -70605,123 +70357,125 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v58 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v10 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v43 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v42 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v40 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v29, 1.0, v54 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v42 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v55 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v6, v41 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v35, 1.0, v37 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_add_f32_e32 v34, 1.0, v38 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v51 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v50 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v49 ; SI-NEXT: v_add_f32_e32 v32, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v34, 1.0, v38 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v32 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v34 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 @@ -70734,7 +70488,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 @@ -70744,45 +70498,43 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 @@ -70866,11 +70618,18 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -70878,7 +70637,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -70889,7 +70648,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -70900,7 +70659,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -70911,7 +70670,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -70922,7 +70681,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -70933,7 +70692,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -70944,7 +70703,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -70955,7 +70714,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -70966,7 +70725,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -70977,7 +70736,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -70988,7 +70747,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -70999,7 +70758,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -71010,7 +70769,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -71018,19 +70777,21 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -71039,7 +70800,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -71048,15 +70809,6 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -71064,9 +70816,9 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -71091,15 +70843,14 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr36 @@ -71124,70 +70875,71 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 ; SI-NEXT: ; kill: killed $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; kill: killed $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; kill: killed $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v32f32_to_v64f16_scalar: @@ -71856,218 +71608,210 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -72075,27 +71819,37 @@ define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload @@ -72448,22 +72202,23 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v28 ; SI-NEXT: v_mov_b32_e32 v53, v26 -; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v41, v6 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -72475,40 +72230,40 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 ; SI-NEXT: v_mov_b32_e32 v54, v14 ; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_mov_b32_e32 v41, v11 -; SI-NEXT: v_mov_b32_e32 v40, v10 -; SI-NEXT: v_mov_b32_e32 v44, v9 -; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v44, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 @@ -72519,25 +72274,25 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v42 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -72549,68 +72304,77 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_or_b32_e32 v19, v54, v19 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_mov_b32_e32 v34, v53 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_mov_b32_e32 v53, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: v_mov_b32_e32 v52, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v51, v22 ; SI-NEXT: v_mov_b32_e32 v51, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 @@ -72632,82 +72396,76 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v27, v38, v27 ; SI-NEXT: v_mov_b32_e32 v38, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v28, v37, v28 ; SI-NEXT: v_mov_b32_e32 v37, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 ; SI-NEXT: v_or_b32_e32 v9, v14, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_or_b32_e32 v19, v54, v19 -; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 -; SI-NEXT: v_or_b32_e32 v2, v60, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_or_b32_e32 v12, v61, v12 ; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v13, v59, v13 ; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_or_b32_e32 v13, v57, v13 ; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_or_b32_e32 v14, v47, v14 ; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v45, v15 ; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v15, v43, v15 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v40, v17 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_or_b32_e32 v18, v55, v18 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_mov_b32_e32 v42, v55 +; SI-NEXT: v_or_b32_e32 v17, v55, v17 +; SI-NEXT: v_mov_b32_e32 v33, v40 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 ; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: v_or_b32_e32 v31, v63, v31 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB47_3 ; SI-NEXT: .LBB47_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v62, v61 ; SI-NEXT: v_mov_b32_e32 v60, v59 ; SI-NEXT: v_mov_b32_e32 v58, v57 ; SI-NEXT: v_mov_b32_e32 v56, v47 ; SI-NEXT: v_mov_b32_e32 v46, v45 ; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v42, v55 +; SI-NEXT: v_mov_b32_e32 v33, v40 +; SI-NEXT: v_mov_b32_e32 v36, v54 ; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v34, v53 +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: v_mov_b32_e32 v53, v21 +; SI-NEXT: v_mov_b32_e32 v52, v22 ; SI-NEXT: v_mov_b32_e32 v51, v23 ; SI-NEXT: v_mov_b32_e32 v50, v24 ; SI-NEXT: v_mov_b32_e32 v49, v25 @@ -72718,298 +72476,290 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB47_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v35, v40 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v40, v46 -; SI-NEXT: v_mov_b32_e32 v41, v56 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v43, v60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v63 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v43, v58 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB47_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v47 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_mov_b32_e32 v55, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v55 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v34 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v58 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v35 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -73022,7 +72772,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 @@ -73039,7 +72789,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 ; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -73047,7 +72797,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 @@ -73077,7 +72827,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v32f32_scalar: @@ -73308,252 +73058,214 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v160, v13 :: v_dual_mov_b32 v161, v12 +; GFX11-NEXT: v_dual_mov_b32 v162, v11 :: v_dual_mov_b32 v163, v10 +; GFX11-NEXT: v_dual_mov_b32 v164, v9 :: v_dual_mov_b32 v165, v8 +; GFX11-NEXT: v_dual_mov_b32 v166, v7 :: v_dual_mov_b32 v167, v6 +; GFX11-NEXT: v_dual_mov_b32 v176, v5 :: v_dual_mov_b32 v177, v4 +; GFX11-NEXT: v_dual_mov_b32 v178, v3 :: v_dual_mov_b32 v179, v2 +; GFX11-NEXT: v_dual_mov_b32 v180, v1 :: v_dual_mov_b32 v181, v0 +; GFX11-NEXT: v_dual_mov_b32 v182, s28 :: v_dual_mov_b32 v183, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:96 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 ; GFX11-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v27, s18 +; GFX11-NEXT: v_dual_mov_b32 v20, s17 :: v_dual_mov_b32 v35, s19 +; GFX11-NEXT: v_dual_mov_b32 v44, s20 :: v_dual_mov_b32 v65, s22 +; GFX11-NEXT: v_dual_mov_b32 v54, s21 :: v_dual_mov_b32 v77, s23 +; GFX11-NEXT: v_dual_mov_b32 v90, s24 :: v_dual_mov_b32 v119, s26 +; GFX11-NEXT: v_dual_mov_b32 v104, s25 :: v_dual_mov_b32 v135, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-NEXT: .LBB47_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v135, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v119, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v104, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v90, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v77, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v65, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v54, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v44, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v35, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v160, 0x200, v160 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v161, 0x200, v161 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v162, 0x200, v162 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v163, 0x200, v163 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v164, 0x200, v164 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v165, 0x200, v165 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v166, 0x200, v166 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v167, 0x200, v167 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB47_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 +; GFX11-NEXT: v_mov_b32_e32 v13, v104 ; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:220 +; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14 +; GFX11-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 +; GFX11-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65 +; GFX11-NEXT: v_mov_b32_e32 v14, v119 +; GFX11-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v182 +; GFX11-NEXT: v_dual_mov_b32 v17, v183 :: v_dual_mov_b32 v18, v181 +; GFX11-NEXT: v_dual_mov_b32 v19, v180 :: v_dual_mov_b32 v20, v179 +; GFX11-NEXT: v_dual_mov_b32 v21, v178 :: v_dual_mov_b32 v22, v177 +; GFX11-NEXT: v_dual_mov_b32 v23, v176 :: v_dual_mov_b32 v24, v167 +; GFX11-NEXT: v_dual_mov_b32 v25, v166 :: v_dual_mov_b32 v26, v165 +; GFX11-NEXT: v_dual_mov_b32 v27, v164 :: v_dual_mov_b32 v28, v163 +; GFX11-NEXT: v_dual_mov_b32 v29, v162 :: v_dual_mov_b32 v30, v161 +; GFX11-NEXT: v_mov_b32_e32 v31, v160 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB47_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 ; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109 +; GFX11-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122 +; GFX11-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136 +; GFX11-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151 ; GFX11-NEXT: s_branch .LBB47_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -74120,6 +73832,7 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: v_mov_b32_e32 v31, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v32, s17 ; SI-NEXT: v_mov_b32_e32 v29, s18 ; SI-NEXT: v_mov_b32_e32 v30, s19 @@ -74131,7 +73844,6 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v24, s25 ; SI-NEXT: v_mov_b32_e32 v21, s26 ; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v19, s28 ; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -74152,293 +73864,293 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[36:37], v[15:16], 16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshr_b64 v[56:57], v[11:12], 16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshr_b64 v[57:58], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[21:22], 16 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshr_b64 v[58:59], v[7:8], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[34:35], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[59:60], v[5:6], 16 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[39:40], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[60:61], v[3:4], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[53:54], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[29:30], 16 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[61:62], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 -; SI-NEXT: v_mov_b32_e32 v53, v40 -; SI-NEXT: v_lshr_b64 v[39:40], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[40:41], v[25:26], 16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshr_b64 v[54:55], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[41:42], v[27:28], 16 -; SI-NEXT: v_mov_b32_e32 v55, v48 -; SI-NEXT: v_lshr_b64 v[48:49], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[42:43], v[29:30], 16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v32 -; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[43:44], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[62:63], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_lshr_b64 v[39:40], v[23:24], 16 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_lshr_b64 v[56:57], v[11:12], 16 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[57:58], v[9:10], 16 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[40:41], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[58:59], v[7:8], 16 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[34:35], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[59:60], v[5:6], 16 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[33:34], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[41:42], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[60:61], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[39:40], v[27:28], 16 ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[42:43], v[29:30], 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[54:55], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[43:44], v[31:32], 16 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: v_lshr_b64 v[61:62], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[62:63], v[19:20], 16 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[54:55], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v43 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v50 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_or_b32_e32 v31, v31, v41 ; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v55 ; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v42 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v40 ; SI-NEXT: v_or_b32_e32 v29, v29, v31 ; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v41 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v39 ; SI-NEXT: v_or_b32_e32 v27, v27, v29 ; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v43 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v54 ; SI-NEXT: v_or_b32_e32 v25, v25, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v52 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v39 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v53 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v62 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v61 ; SI-NEXT: v_or_b32_e32 v1, v1, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -74453,7 +74165,7 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -74468,7 +74180,7 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -74491,44 +74203,44 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v32f32_to_v64i16_scalar: @@ -74824,10 +74536,10 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -75047,8 +74759,8 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v11, v11, v44 ; SI-NEXT: v_or_b32_e32 v12, v12, v43 ; SI-NEXT: v_or_b32_e32 v13, v13, v42 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v55 +; SI-NEXT: v_or_b32_e32 v14, v14, v55 +; SI-NEXT: v_or_b32_e32 v15, v15, v40 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -75069,8 +74781,8 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 @@ -75248,8 +74960,8 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v11, v44, v11 ; SI-NEXT: v_or_b32_e32 v12, v43, v12 ; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v55, v15 +; SI-NEXT: v_or_b32_e32 v14, v55, v14 +; SI-NEXT: v_or_b32_e32 v15, v40, v15 ; SI-NEXT: v_or_b32_e32 v19, v39, v19 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -75580,222 +75292,207 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v12 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v38, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v46, v10 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v55 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB51_2 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v7, v0, v48 +; SI-NEXT: v_mov_b32_e32 v60, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v8, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v39 +; SI-NEXT: v_or_b32_e32 v9, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v11, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v12, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v13, v0, v44 +; SI-NEXT: v_or_b32_e32 v10, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v13, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v43 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v16, v0, v42 +; SI-NEXT: v_or_b32_e32 v16, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v18, v0, v41 +; SI-NEXT: v_or_b32_e32 v18, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v19, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v20, v0, v37 +; SI-NEXT: v_or_b32_e32 v20, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v22, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: v_or_b32_e32 v24, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 ; SI-NEXT: v_or_b32_e32 v27, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v28, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v29, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_or_b32_e32 v8, v1, v56 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: v_or_b32_e32 v31, v0, v31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB51_3 -; SI-NEXT: .LBB51_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB51_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v58, v49 -; SI-NEXT: s_cbranch_vccnz .LBB51_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_mov_b32_e32 v54, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v48 +; SI-NEXT: v_mov_b32_e32 v61, v39 +; SI-NEXT: v_mov_b32_e32 v59, v37 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v53, v35 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v63, v56 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -75840,139 +75537,134 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -75981,7 +75673,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB51_5: ; %end +; SI-NEXT: .LBB51_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -75998,8 +75690,26 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB51_4: +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v48 +; SI-NEXT: v_mov_b32_e32 v61, v39 +; SI-NEXT: v_mov_b32_e32 v60, v0 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v59, v37 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v54, v4 +; SI-NEXT: v_mov_b32_e32 v53, v35 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v63, v56 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v64i16_to_v32f32_scalar: ; VI: ; %bb.0: @@ -76336,252 +76046,214 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v160, v13 :: v_dual_mov_b32 v161, v12 +; GFX11-NEXT: v_dual_mov_b32 v162, v11 :: v_dual_mov_b32 v163, v10 +; GFX11-NEXT: v_dual_mov_b32 v164, v9 :: v_dual_mov_b32 v165, v8 +; GFX11-NEXT: v_dual_mov_b32 v166, v7 :: v_dual_mov_b32 v167, v6 +; GFX11-NEXT: v_dual_mov_b32 v176, v5 :: v_dual_mov_b32 v177, v4 +; GFX11-NEXT: v_dual_mov_b32 v178, v3 :: v_dual_mov_b32 v179, v2 +; GFX11-NEXT: v_dual_mov_b32 v180, v1 :: v_dual_mov_b32 v181, v0 +; GFX11-NEXT: v_dual_mov_b32 v182, s28 :: v_dual_mov_b32 v183, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:96 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 ; GFX11-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v27, s18 +; GFX11-NEXT: v_dual_mov_b32 v20, s17 :: v_dual_mov_b32 v35, s19 +; GFX11-NEXT: v_dual_mov_b32 v44, s20 :: v_dual_mov_b32 v65, s22 +; GFX11-NEXT: v_dual_mov_b32 v54, s21 :: v_dual_mov_b32 v77, s23 +; GFX11-NEXT: v_dual_mov_b32 v90, s24 :: v_dual_mov_b32 v119, s26 +; GFX11-NEXT: v_dual_mov_b32 v104, s25 :: v_dual_mov_b32 v135, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-NEXT: .LBB51_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v135, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v119, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v104, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v90, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v77, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v65, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v54, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v44, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v35, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v160, v160, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v161, v161, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v162, v162, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v163, v163, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v164, v164, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v165, v165, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v166, v166, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v167, v167, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB51_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 +; GFX11-NEXT: v_mov_b32_e32 v13, v104 ; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:220 +; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14 +; GFX11-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 +; GFX11-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65 +; GFX11-NEXT: v_mov_b32_e32 v14, v119 +; GFX11-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v182 +; GFX11-NEXT: v_dual_mov_b32 v17, v183 :: v_dual_mov_b32 v18, v181 +; GFX11-NEXT: v_dual_mov_b32 v19, v180 :: v_dual_mov_b32 v20, v179 +; GFX11-NEXT: v_dual_mov_b32 v21, v178 :: v_dual_mov_b32 v22, v177 +; GFX11-NEXT: v_dual_mov_b32 v23, v176 :: v_dual_mov_b32 v24, v167 +; GFX11-NEXT: v_dual_mov_b32 v25, v166 :: v_dual_mov_b32 v26, v165 +; GFX11-NEXT: v_dual_mov_b32 v27, v164 :: v_dual_mov_b32 v28, v163 +; GFX11-NEXT: v_dual_mov_b32 v29, v162 :: v_dual_mov_b32 v30, v161 +; GFX11-NEXT: v_mov_b32_e32 v31, v160 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB51_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 ; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109 +; GFX11-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122 +; GFX11-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136 +; GFX11-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151 ; GFX11-NEXT: s_branch .LBB51_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -77335,6 +77007,7 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 @@ -77345,7 +77018,6 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB55_4 @@ -77399,6 +77071,7 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 @@ -77409,7 +77082,6 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB55_4 @@ -77463,6 +77135,7 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 @@ -77473,7 +77146,6 @@ define inreg <16 x i64> @bitcast_v16f64_to_v16i64_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB55_4 @@ -78786,10 +78458,68 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -78801,7 +78531,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -78813,6 +78544,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -78823,6 +78557,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -78833,6 +78570,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -78843,6 +78583,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -78853,6 +78596,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -78863,6 +78609,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -78873,6 +78622,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -78883,278 +78635,203 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB56_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 -; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v41, v38 +; VI-NEXT: v_lshrrev_b64 v[38:39], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; VI-NEXT: v_mov_b32_e32 v39, v55 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v1 ; VI-NEXT: .LBB56_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB56_4 @@ -79227,236 +78904,240 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[38:39], 24, v[1:2] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v32 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 -; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v1 ; VI-NEXT: .LBB56_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v48 -; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v57 -; VI-NEXT: v_or_b32_sdwa v2, v2, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v41 -; VI-NEXT: v_or_b32_sdwa v48, v53, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v55 +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v38 +; VI-NEXT: v_or_b32_sdwa v55, v60, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v53 +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v34 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 -; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -79467,23 +79148,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -79494,23 +79174,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -79521,23 +79200,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -79548,23 +79226,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -79575,10 +79252,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -79588,9 +79265,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -79601,10 +79278,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -79615,9 +79292,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -79628,10 +79305,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -79642,9 +79319,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -79655,21 +79332,23 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -79680,10 +79359,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -79694,9 +79373,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -79707,10 +79386,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -79721,14 +79400,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v42 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -79772,393 +79449,388 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; kill: killed $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; kill: killed $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(45) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB56_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(47) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v42, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v1 ; GFX9-NEXT: .LBB56_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB56_4 @@ -80193,7 +79865,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v28, vcc ; GFX9-NEXT: v_add_co_u32_e32 v29, vcc, 3, v29 ; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, 0, v30, vcc -; GFX9-NEXT: s_waitcnt vmcnt(44) +; GFX9-NEXT: s_waitcnt vmcnt(42) ; GFX9-NEXT: v_add_co_u32_e32 v31, vcc, 3, v31 ; GFX9-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v32, vcc ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -80248,339 +79920,326 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v1 ; GFX9-NEXT: .LBB56_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v52 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v36 -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v50 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v57 -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v56 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v38 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v47 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v46 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v55 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v41 -; GFX9-NEXT: v_or_b32_sdwa v33, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v54, v54, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80590,11 +80249,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -80603,10 +80262,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80616,11 +80275,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -80629,10 +80288,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80642,11 +80301,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -80655,10 +80314,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80668,11 +80327,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -80681,10 +80340,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80694,11 +80353,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -80707,10 +80366,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -80720,11 +80379,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -80733,15 +80392,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v42 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -81029,17 +80685,17 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l @@ -81233,28 +80889,27 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 @@ -81340,6 +80995,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 @@ -81348,7 +81004,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 @@ -81357,18 +81012,19 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 @@ -81450,10 +81106,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: .LBB56_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB56_4 @@ -81595,56 +81250,52 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: .LBB56_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v66 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v67, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v61 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v67, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v58 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v57 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v47 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v46 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v45 @@ -81653,22 +81304,26 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v64 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v65, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v44 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v64 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v42 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v41 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v40 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 @@ -81898,27 +81553,26 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:84 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -81971,10 +81625,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v21, s68, 20 ; SI-NEXT: v_mov_b32_e32 v20, s16 ; SI-NEXT: v_writelane_b32 v21, s69, 21 -; SI-NEXT: v_readfirstlane_b32 s56, v20 +; SI-NEXT: v_readfirstlane_b32 s58, v20 ; SI-NEXT: v_mov_b32_e32 v20, s17 ; SI-NEXT: v_writelane_b32 v21, s70, 22 -; SI-NEXT: v_readfirstlane_b32 s57, v20 +; SI-NEXT: v_readfirstlane_b32 s59, v20 ; SI-NEXT: v_mov_b32_e32 v20, s18 ; SI-NEXT: v_writelane_b32 v21, s71, 23 ; SI-NEXT: v_readfirstlane_b32 s46, v20 @@ -82036,159 +81690,159 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s26, s5, 24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v23, s26, 40 ; SI-NEXT: s_lshr_b32 s26, s5, 16 -; SI-NEXT: v_writelane_b32 v23, s26, 41 -; SI-NEXT: s_lshr_b32 s26, s5, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 42 -; SI-NEXT: s_lshr_b32 s26, s7, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 43 -; SI-NEXT: s_lshr_b32 s26, s7, 16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v23, s26, 44 -; SI-NEXT: s_lshr_b32 s26, s7, 8 +; SI-NEXT: s_lshr_b32 s26, s5, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 45 -; SI-NEXT: s_lshr_b32 s26, s9, 24 +; SI-NEXT: s_lshr_b32 s26, s7, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 46 -; SI-NEXT: s_lshr_b32 s26, s9, 16 +; SI-NEXT: s_lshr_b32 s26, s7, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 47 -; SI-NEXT: s_lshr_b32 s26, s9, 8 +; SI-NEXT: s_lshr_b32 s26, s7, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 48 -; SI-NEXT: s_lshr_b32 s26, s11, 24 +; SI-NEXT: s_lshr_b32 s26, s9, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 49 -; SI-NEXT: s_lshr_b32 s26, s11, 16 +; SI-NEXT: s_lshr_b32 s26, s9, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 50 -; SI-NEXT: s_lshr_b32 s26, s11, 8 +; SI-NEXT: s_lshr_b32 s26, s9, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 51 -; SI-NEXT: s_lshr_b32 s26, s13, 24 +; SI-NEXT: s_lshr_b32 s26, s11, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 52 -; SI-NEXT: s_lshr_b32 s26, s13, 16 +; SI-NEXT: s_lshr_b32 s26, s11, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 53 -; SI-NEXT: s_lshr_b32 s26, s13, 8 +; SI-NEXT: s_lshr_b32 s26, s11, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 54 -; SI-NEXT: s_lshr_b32 s26, s15, 24 +; SI-NEXT: s_lshr_b32 s26, s13, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 55 -; SI-NEXT: s_lshr_b32 s26, s15, 16 +; SI-NEXT: s_lshr_b32 s26, s13, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 56 -; SI-NEXT: s_lshr_b32 s26, s15, 8 +; SI-NEXT: s_lshr_b32 s26, s13, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 57 -; SI-NEXT: s_lshr_b32 s26, s17, 24 +; SI-NEXT: s_lshr_b32 s26, s15, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 58 -; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s26, s15, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 59 -; SI-NEXT: s_lshr_b32 s26, s17, 8 +; SI-NEXT: s_lshr_b32 s26, s15, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 60 -; SI-NEXT: s_lshr_b32 s26, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s17, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 61 -; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 62 -; SI-NEXT: s_lshr_b32 s26, s19, 8 +; SI-NEXT: s_lshr_b32 s26, s17, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 63 -; SI-NEXT: s_lshr_b32 s26, s21, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 24 ; SI-NEXT: v_writelane_b32 v22, s26, 0 -; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s19, 16 ; SI-NEXT: v_writelane_b32 v22, s26, 1 -; SI-NEXT: s_lshr_b32 s26, s21, 8 +; SI-NEXT: s_lshr_b32 s26, s19, 8 ; SI-NEXT: v_writelane_b32 v22, s26, 2 -; SI-NEXT: s_lshr_b32 s26, s23, 24 +; SI-NEXT: s_lshr_b32 s26, s21, 24 ; SI-NEXT: v_writelane_b32 v22, s26, 3 -; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: s_lshr_b32 s26, s21, 16 ; SI-NEXT: v_writelane_b32 v22, s26, 4 -; SI-NEXT: s_lshr_b32 s26, s23, 8 +; SI-NEXT: s_lshr_b32 s26, s21, 8 ; SI-NEXT: v_writelane_b32 v22, s26, 5 -; SI-NEXT: s_lshr_b32 s26, s25, 24 +; SI-NEXT: s_lshr_b32 s26, s23, 24 ; SI-NEXT: v_writelane_b32 v22, s26, 6 -; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s26, s23, 16 ; SI-NEXT: v_writelane_b32 v22, s26, 7 -; SI-NEXT: s_lshr_b32 s26, s25, 8 +; SI-NEXT: s_lshr_b32 s26, s23, 8 ; SI-NEXT: v_writelane_b32 v22, s26, 8 -; SI-NEXT: s_lshr_b32 s26, s41, 24 +; SI-NEXT: s_lshr_b32 s26, s25, 24 ; SI-NEXT: v_writelane_b32 v22, s26, 9 -; SI-NEXT: s_lshr_b32 s26, s41, 16 +; SI-NEXT: s_lshr_b32 s26, s25, 16 ; SI-NEXT: v_writelane_b32 v22, s26, 10 -; SI-NEXT: s_lshr_b32 s26, s41, 8 +; SI-NEXT: s_lshr_b32 s26, s25, 8 ; SI-NEXT: v_writelane_b32 v22, s26, 11 -; SI-NEXT: s_lshr_b32 s26, s43, 24 +; SI-NEXT: s_lshr_b32 s26, s41, 24 ; SI-NEXT: v_writelane_b32 v22, s26, 12 -; SI-NEXT: s_lshr_b32 s26, s43, 16 +; SI-NEXT: s_lshr_b32 s26, s41, 16 ; SI-NEXT: v_writelane_b32 v22, s26, 13 -; SI-NEXT: s_lshr_b32 s26, s43, 8 +; SI-NEXT: s_lshr_b32 s26, s41, 8 ; SI-NEXT: v_writelane_b32 v22, s26, 14 -; SI-NEXT: s_lshr_b32 s26, s45, 24 +; SI-NEXT: s_lshr_b32 s26, s43, 24 ; SI-NEXT: v_writelane_b32 v22, s26, 15 -; SI-NEXT: s_lshr_b32 s26, s45, 16 +; SI-NEXT: s_lshr_b32 s26, s43, 16 ; SI-NEXT: v_writelane_b32 v22, s26, 16 -; SI-NEXT: s_lshr_b32 s26, s45, 8 +; SI-NEXT: s_lshr_b32 s26, s43, 8 ; SI-NEXT: v_writelane_b32 v22, s26, 17 ; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 42 +; SI-NEXT: v_writelane_b32 v23, s27, 43 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 40 +; SI-NEXT: v_writelane_b32 v23, s27, 41 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 38 ; SI-NEXT: v_writelane_b32 v23, s27, 39 -; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 36 ; SI-NEXT: v_writelane_b32 v23, s27, 37 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 34 ; SI-NEXT: v_writelane_b32 v23, s27, 35 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 32 ; SI-NEXT: v_writelane_b32 v23, s27, 33 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 30 ; SI-NEXT: v_writelane_b32 v23, s27, 31 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 28 ; SI-NEXT: v_writelane_b32 v23, s27, 29 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 26 ; SI-NEXT: v_writelane_b32 v23, s27, 27 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 24 ; SI-NEXT: v_writelane_b32 v23, s27, 25 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 22 ; SI-NEXT: v_writelane_b32 v23, s27, 23 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 20 ; SI-NEXT: v_writelane_b32 v23, s27, 21 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 18 ; SI-NEXT: v_writelane_b32 v23, s27, 19 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 16 ; SI-NEXT: v_writelane_b32 v23, s27, 17 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 14 ; SI-NEXT: v_writelane_b32 v23, s27, 15 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 12 ; SI-NEXT: v_writelane_b32 v23, s27, 13 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 10 ; SI-NEXT: v_writelane_b32 v23, s27, 11 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 8 ; SI-NEXT: v_writelane_b32 v23, s27, 9 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 6 ; SI-NEXT: v_writelane_b32 v23, s27, 7 -; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 4 ; SI-NEXT: v_writelane_b32 v23, s27, 5 -; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 2 ; SI-NEXT: v_writelane_b32 v23, s27, 3 -; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 0 +; SI-NEXT: s_lshr_b32 s65, s5, 24 +; SI-NEXT: s_lshr_b32 s51, s45, 24 +; SI-NEXT: s_lshr_b32 s52, s45, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 8 ; SI-NEXT: s_lshr_b32 s49, s47, 24 -; SI-NEXT: s_lshr_b32 s48, s47, 16 -; SI-NEXT: s_lshr_b32 s50, s47, 8 -; SI-NEXT: s_lshr_b32 s51, s57, 24 -; SI-NEXT: s_lshr_b32 s52, s57, 16 -; SI-NEXT: s_lshr_b32 s53, s57, 8 -; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s54, s47, 16 +; SI-NEXT: s_lshr_b32 s55, s47, 8 +; SI-NEXT: s_lshr_b32 s48, s59, 24 +; SI-NEXT: s_lshr_b32 s50, s59, 16 +; SI-NEXT: s_lshr_b32 s64, s59, 8 ; SI-NEXT: v_writelane_b32 v23, s27, 1 -; SI-NEXT: s_lshr_b64 s[64:65], s[18:19], 24 ; SI-NEXT: s_lshr_b64 s[66:67], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 8 ; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 24 @@ -82200,7 +81854,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[98:99], s[24:25], 24 ; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 ; SI-NEXT: s_lshr_b64 s[28:29], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[56:57], s[40:41], 24 ; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 8 ; SI-NEXT: s_lshr_b64 s[72:73], s[42:43], 24 @@ -82212,9 +81866,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[92:93], s[46:47], 24 ; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 ; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[56:57], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[58:59], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[58:59], 8 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s4, 3 @@ -82247,160 +81901,160 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_addc_u32 s45, s45, 0 ; SI-NEXT: s_add_u32 s46, s46, 3 ; SI-NEXT: s_addc_u32 s47, s47, 0 -; SI-NEXT: s_add_u32 s56, s56, 3 -; SI-NEXT: s_addc_u32 s57, s57, 0 -; SI-NEXT: s_lshr_b32 s26, s5, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 40 +; SI-NEXT: s_add_u32 s58, s58, 3 +; SI-NEXT: s_addc_u32 s59, s59, 0 ; SI-NEXT: s_lshr_b32 s26, s5, 16 -; SI-NEXT: v_writelane_b32 v23, s26, 41 -; SI-NEXT: s_lshr_b32 s26, s5, 8 -; SI-NEXT: v_writelane_b32 v23, s26, 42 -; SI-NEXT: s_lshr_b32 s26, s7, 24 -; SI-NEXT: v_writelane_b32 v23, s26, 43 -; SI-NEXT: s_lshr_b32 s26, s7, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 44 -; SI-NEXT: s_lshr_b32 s26, s7, 8 +; SI-NEXT: s_lshr_b32 s26, s5, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 45 -; SI-NEXT: s_lshr_b32 s26, s9, 24 +; SI-NEXT: s_lshr_b32 s26, s7, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 46 -; SI-NEXT: s_lshr_b32 s26, s9, 16 +; SI-NEXT: s_lshr_b32 s26, s7, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 47 -; SI-NEXT: s_lshr_b32 s26, s9, 8 +; SI-NEXT: s_lshr_b32 s26, s7, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 48 -; SI-NEXT: s_lshr_b32 s26, s11, 24 +; SI-NEXT: s_lshr_b32 s26, s9, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 49 -; SI-NEXT: s_lshr_b32 s26, s11, 16 +; SI-NEXT: s_lshr_b32 s26, s9, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 50 -; SI-NEXT: s_lshr_b32 s26, s11, 8 +; SI-NEXT: s_lshr_b32 s26, s9, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 51 -; SI-NEXT: s_lshr_b32 s26, s13, 24 +; SI-NEXT: s_lshr_b32 s26, s11, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 52 -; SI-NEXT: s_lshr_b32 s26, s13, 16 +; SI-NEXT: s_lshr_b32 s26, s11, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 53 -; SI-NEXT: s_lshr_b32 s26, s13, 8 +; SI-NEXT: s_lshr_b32 s26, s11, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 54 -; SI-NEXT: s_lshr_b32 s26, s15, 24 +; SI-NEXT: s_lshr_b32 s26, s13, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 55 -; SI-NEXT: s_lshr_b32 s26, s15, 16 +; SI-NEXT: s_lshr_b32 s26, s13, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 56 -; SI-NEXT: s_lshr_b32 s26, s15, 8 +; SI-NEXT: s_lshr_b32 s26, s13, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 57 -; SI-NEXT: s_lshr_b32 s26, s17, 24 +; SI-NEXT: s_lshr_b32 s26, s15, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 58 -; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s26, s15, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 59 -; SI-NEXT: s_lshr_b32 s26, s17, 8 +; SI-NEXT: s_lshr_b32 s26, s15, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 60 -; SI-NEXT: s_lshr_b32 s26, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s17, 24 ; SI-NEXT: v_writelane_b32 v23, s26, 61 -; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 ; SI-NEXT: v_writelane_b32 v23, s26, 62 -; SI-NEXT: s_lshr_b32 s26, s19, 8 +; SI-NEXT: s_lshr_b32 s26, s17, 8 ; SI-NEXT: v_writelane_b32 v23, s26, 63 -; SI-NEXT: s_lshr_b32 s26, s21, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 24 ; SI-NEXT: v_writelane_b32 v22, s26, 0 -; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s19, 16 ; SI-NEXT: v_writelane_b32 v22, s26, 1 -; SI-NEXT: s_lshr_b32 s26, s21, 8 +; SI-NEXT: s_lshr_b32 s26, s19, 8 ; SI-NEXT: v_writelane_b32 v22, s26, 2 -; SI-NEXT: s_lshr_b32 s26, s23, 24 +; SI-NEXT: s_lshr_b32 s26, s21, 24 ; SI-NEXT: v_writelane_b32 v22, s26, 3 -; SI-NEXT: s_lshr_b32 s26, s23, 16 +; SI-NEXT: s_lshr_b32 s26, s21, 16 ; SI-NEXT: v_writelane_b32 v22, s26, 4 -; SI-NEXT: s_lshr_b32 s26, s23, 8 +; SI-NEXT: s_lshr_b32 s26, s21, 8 ; SI-NEXT: v_writelane_b32 v22, s26, 5 -; SI-NEXT: s_lshr_b32 s26, s25, 24 +; SI-NEXT: s_lshr_b32 s26, s23, 24 ; SI-NEXT: v_writelane_b32 v22, s26, 6 -; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s26, s23, 16 ; SI-NEXT: v_writelane_b32 v22, s26, 7 -; SI-NEXT: s_lshr_b32 s26, s25, 8 +; SI-NEXT: s_lshr_b32 s26, s23, 8 ; SI-NEXT: v_writelane_b32 v22, s26, 8 -; SI-NEXT: s_lshr_b32 s26, s41, 24 +; SI-NEXT: s_lshr_b32 s26, s25, 24 ; SI-NEXT: v_writelane_b32 v22, s26, 9 -; SI-NEXT: s_lshr_b32 s26, s41, 16 +; SI-NEXT: s_lshr_b32 s26, s25, 16 ; SI-NEXT: v_writelane_b32 v22, s26, 10 -; SI-NEXT: s_lshr_b32 s26, s41, 8 +; SI-NEXT: s_lshr_b32 s26, s25, 8 ; SI-NEXT: v_writelane_b32 v22, s26, 11 -; SI-NEXT: s_lshr_b32 s26, s43, 24 +; SI-NEXT: s_lshr_b32 s26, s41, 24 ; SI-NEXT: v_writelane_b32 v22, s26, 12 -; SI-NEXT: s_lshr_b32 s26, s43, 16 +; SI-NEXT: s_lshr_b32 s26, s41, 16 ; SI-NEXT: v_writelane_b32 v22, s26, 13 -; SI-NEXT: s_lshr_b32 s26, s43, 8 +; SI-NEXT: s_lshr_b32 s26, s41, 8 ; SI-NEXT: v_writelane_b32 v22, s26, 14 -; SI-NEXT: s_lshr_b32 s26, s45, 24 +; SI-NEXT: s_lshr_b32 s26, s43, 24 ; SI-NEXT: v_writelane_b32 v22, s26, 15 -; SI-NEXT: s_lshr_b32 s26, s45, 16 +; SI-NEXT: s_lshr_b32 s26, s43, 16 ; SI-NEXT: v_writelane_b32 v22, s26, 16 -; SI-NEXT: s_lshr_b32 s26, s45, 8 +; SI-NEXT: s_lshr_b32 s26, s43, 8 ; SI-NEXT: v_writelane_b32 v22, s26, 17 ; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v23, s26, 42 +; SI-NEXT: v_writelane_b32 v23, s27, 43 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v23, s26, 40 +; SI-NEXT: v_writelane_b32 v23, s27, 41 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 38 ; SI-NEXT: v_writelane_b32 v23, s27, 39 -; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 36 ; SI-NEXT: v_writelane_b32 v23, s27, 37 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 34 ; SI-NEXT: v_writelane_b32 v23, s27, 35 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 32 ; SI-NEXT: v_writelane_b32 v23, s27, 33 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 30 ; SI-NEXT: v_writelane_b32 v23, s27, 31 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 28 ; SI-NEXT: v_writelane_b32 v23, s27, 29 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 26 ; SI-NEXT: v_writelane_b32 v23, s27, 27 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 24 ; SI-NEXT: v_writelane_b32 v23, s27, 25 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 22 ; SI-NEXT: v_writelane_b32 v23, s27, 23 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 20 ; SI-NEXT: v_writelane_b32 v23, s27, 21 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 18 ; SI-NEXT: v_writelane_b32 v23, s27, 19 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 16 ; SI-NEXT: v_writelane_b32 v23, s27, 17 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 14 ; SI-NEXT: v_writelane_b32 v23, s27, 15 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 12 ; SI-NEXT: v_writelane_b32 v23, s27, 13 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 10 ; SI-NEXT: v_writelane_b32 v23, s27, 11 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 8 ; SI-NEXT: v_writelane_b32 v23, s27, 9 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 6 ; SI-NEXT: v_writelane_b32 v23, s27, 7 -; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 16 ; SI-NEXT: v_writelane_b32 v23, s26, 4 ; SI-NEXT: v_writelane_b32 v23, s27, 5 -; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 8 ; SI-NEXT: v_writelane_b32 v23, s26, 2 ; SI-NEXT: v_writelane_b32 v23, s27, 3 -; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 24 ; SI-NEXT: v_writelane_b32 v23, s26, 0 +; SI-NEXT: s_lshr_b32 s65, s5, 24 +; SI-NEXT: s_lshr_b32 s51, s45, 24 +; SI-NEXT: s_lshr_b32 s52, s45, 16 +; SI-NEXT: s_lshr_b32 s53, s45, 8 ; SI-NEXT: s_lshr_b32 s49, s47, 24 -; SI-NEXT: s_lshr_b32 s48, s47, 16 -; SI-NEXT: s_lshr_b32 s50, s47, 8 -; SI-NEXT: s_lshr_b32 s51, s57, 24 -; SI-NEXT: s_lshr_b32 s52, s57, 16 -; SI-NEXT: s_lshr_b32 s53, s57, 8 -; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s54, s47, 16 +; SI-NEXT: s_lshr_b32 s55, s47, 8 +; SI-NEXT: s_lshr_b32 s48, s59, 24 +; SI-NEXT: s_lshr_b32 s50, s59, 16 +; SI-NEXT: s_lshr_b32 s64, s59, 8 ; SI-NEXT: v_writelane_b32 v23, s27, 1 -; SI-NEXT: s_lshr_b64 s[64:65], s[18:19], 24 ; SI-NEXT: s_lshr_b64 s[66:67], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 8 ; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 24 @@ -82412,7 +82066,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[98:99], s[24:25], 24 ; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 ; SI-NEXT: s_lshr_b64 s[28:29], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[56:57], s[40:41], 24 ; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 8 ; SI-NEXT: s_lshr_b64 s[72:73], s[42:43], 24 @@ -82424,255 +82078,259 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[92:93], s[46:47], 24 ; SI-NEXT: s_lshr_b64 s[94:95], s[46:47], 16 ; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[56:57], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[56:57], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[58:59], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[58:59], 8 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s38, 8 -; SI-NEXT: s_and_b32 s29, s56, 0xff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s58, 0xff +; SI-NEXT: s_lshl_b32 s29, s38, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: s_and_b32 s29, s36, 0xff -; SI-NEXT: s_lshl_b32 s56, s34, 24 ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_or_b32 s29, s56, s29 +; SI-NEXT: s_lshl_b32 s57, s34, 24 +; SI-NEXT: s_or_b32 s29, s57, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v1, s27 -; SI-NEXT: s_and_b32 s27, s57, 0xff -; SI-NEXT: s_lshl_b32 s29, s53, 8 +; SI-NEXT: s_and_b32 s27, s59, 0xff +; SI-NEXT: s_lshl_b32 s29, s64, 8 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: s_and_b32 s29, s52, 0xff +; SI-NEXT: s_and_b32 s29, s50, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_lshl_b32 s56, s51, 24 -; SI-NEXT: s_or_b32 s29, s56, s29 +; SI-NEXT: s_lshl_b32 s57, s48, 24 +; SI-NEXT: s_or_b32 s29, s57, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s30, 8 -; SI-NEXT: s_and_b32 s29, s46, 0xff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s46, 0xff +; SI-NEXT: s_lshl_b32 s29, s30, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: s_and_b32 s29, s94, 0xff -; SI-NEXT: s_lshl_b32 s46, s92, 24 ; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s46, s92, 24 ; SI-NEXT: s_or_b32 s29, s46, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v3, s27 ; SI-NEXT: s_and_b32 s27, s47, 0xff -; SI-NEXT: s_lshl_b32 s29, s50, 8 +; SI-NEXT: s_lshl_b32 s29, s55, 8 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: s_and_b32 s29, s48, 0xff +; SI-NEXT: s_and_b32 s29, s54, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 16 ; SI-NEXT: s_lshl_b32 s46, s49, 24 ; SI-NEXT: s_or_b32 s29, s46, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s90, 8 -; SI-NEXT: s_and_b32 s29, s44, 0xff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s44, 0xff +; SI-NEXT: s_lshl_b32 s29, s90, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: s_and_b32 s29, s88, 0xff -; SI-NEXT: s_lshl_b32 s44, s78, 24 ; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s44, s78, 24 ; SI-NEXT: s_or_b32 s29, s44, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_readlane_b32 s29, v22, 17 ; SI-NEXT: v_mov_b32_e32 v5, s27 ; SI-NEXT: s_and_b32 s27, s45, 0xff -; SI-NEXT: s_lshl_b32 s29, s29, 8 +; SI-NEXT: s_lshl_b32 s29, s53, 8 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_readlane_b32 s29, v22, 16 -; SI-NEXT: s_and_b32 s29, s29, 0xff -; SI-NEXT: v_readlane_b32 s44, v22, 15 +; SI-NEXT: s_and_b32 s29, s52, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: s_lshl_b32 s44, s44, 24 +; SI-NEXT: s_lshl_b32 s44, s51, 24 ; SI-NEXT: s_or_b32 s29, s44, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v6, s27 -; SI-NEXT: s_lshl_b32 s27, s76, 8 -; SI-NEXT: s_and_b32 s29, s42, 0xff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s42, 0xff +; SI-NEXT: s_lshl_b32 s29, s76, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: s_and_b32 s29, s74, 0xff -; SI-NEXT: s_lshl_b32 s42, s72, 24 ; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s42, s72, 24 ; SI-NEXT: s_or_b32 s29, s42, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_readlane_b32 s29, v22, 14 +; SI-NEXT: v_readlane_b32 s29, v22, 17 ; SI-NEXT: v_mov_b32_e32 v7, s27 ; SI-NEXT: s_and_b32 s27, s43, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 8 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_readlane_b32 s29, v22, 13 +; SI-NEXT: v_readlane_b32 s29, v22, 16 ; SI-NEXT: s_and_b32 s29, s29, 0xff -; SI-NEXT: v_readlane_b32 s42, v22, 12 +; SI-NEXT: v_readlane_b32 s42, v22, 15 ; SI-NEXT: s_lshl_b32 s29, s29, 16 ; SI-NEXT: s_lshl_b32 s42, s42, 24 ; SI-NEXT: s_or_b32 s29, s42, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v8, s27 -; SI-NEXT: s_lshl_b32 s27, s62, 8 -; SI-NEXT: s_and_b32 s29, s40, 0xff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s40, 0xff +; SI-NEXT: s_lshl_b32 s29, s62, 8 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: s_and_b32 s29, s60, 0xff -; SI-NEXT: s_lshl_b32 s40, s58, 24 ; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: s_lshl_b32 s40, s56, 24 ; SI-NEXT: s_or_b32 s29, s40, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_readlane_b32 s29, v22, 11 +; SI-NEXT: v_readlane_b32 s29, v22, 14 ; SI-NEXT: v_mov_b32_e32 v9, s27 ; SI-NEXT: s_and_b32 s27, s41, 0xff ; SI-NEXT: s_lshl_b32 s29, s29, 8 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_readlane_b32 s29, v22, 10 +; SI-NEXT: v_readlane_b32 s29, v22, 13 ; SI-NEXT: s_and_b32 s29, s29, 0xff -; SI-NEXT: v_readlane_b32 s40, v22, 9 +; SI-NEXT: v_readlane_b32 s40, v22, 12 ; SI-NEXT: s_lshl_b32 s29, s29, 16 ; SI-NEXT: s_lshl_b32 s40, s40, 24 ; SI-NEXT: s_or_b32 s29, s40, s29 ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v10, s27 -; SI-NEXT: s_lshl_b32 s27, s28, 8 ; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_lshl_b32 s27, s28, 8 ; SI-NEXT: s_and_b32 s26, s26, 0xff ; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: s_lshl_b32 s27, s98, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_lshl_b32 s27, s98, 24 ; SI-NEXT: s_or_b32 s26, s27, s26 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s26 ; SI-NEXT: v_mov_b32_e32 v11, s24 ; SI-NEXT: s_and_b32 s24, s25, 0xff -; SI-NEXT: v_readlane_b32 s25, v22, 8 +; SI-NEXT: v_readlane_b32 s25, v22, 11 ; SI-NEXT: s_lshl_b32 s25, s25, 8 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: v_readlane_b32 s25, v22, 7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: v_readlane_b32 s25, v22, 10 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_and_b32 s25, s25, 0xff -; SI-NEXT: v_readlane_b32 s26, v22, 6 +; SI-NEXT: v_readlane_b32 s26, v22, 9 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 ; SI-NEXT: s_lshl_b32 s25, s25, 16 ; SI-NEXT: s_lshl_b32 s26, s26, 24 -; SI-NEXT: s_or_b32 s25, s26, s25 +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s25, s26, s25 +; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: s_lshl_b32 s24, s96, 8 +; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v2, s24 ; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_lshl_b32 s24, s96, 8 +; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 ; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: s_and_b32 s24, s86, 0xff -; SI-NEXT: s_lshl_b32 s25, s84, 24 +; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 ; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: s_lshl_b32 s25, s84, 24 +; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 ; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s22 ; SI-NEXT: s_and_b32 s22, s23, 0xff -; SI-NEXT: v_readlane_b32 s23, v22, 5 +; SI-NEXT: v_readlane_b32 s23, v22, 8 ; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: v_readlane_b32 s23, v22, 4 +; SI-NEXT: v_readlane_b32 s23, v22, 7 ; SI-NEXT: s_and_b32 s23, s23, 0xff -; SI-NEXT: v_readlane_b32 s24, v22, 3 +; SI-NEXT: v_readlane_b32 s24, v22, 6 ; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_lshl_b32 s24, s24, 24 -; SI-NEXT: s_or_b32 s23, s24, s23 ; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s23, s24, s23 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 ; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: s_lshl_b32 s22, s82, 8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s22 ; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s22, s82, 8 ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_and_b32 s22, s80, 0xff -; SI-NEXT: s_lshl_b32 s23, s70, 24 ; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: s_lshl_b32 s23, s70, 24 ; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 ; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s20 ; SI-NEXT: s_and_b32 s20, s21, 0xff -; SI-NEXT: v_readlane_b32 s21, v22, 2 +; SI-NEXT: v_readlane_b32 s21, v22, 5 ; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: v_readlane_b32 s21, v22, 1 +; SI-NEXT: v_readlane_b32 s21, v22, 4 ; SI-NEXT: s_and_b32 s21, s21, 0xff -; SI-NEXT: v_readlane_b32 s22, v22, 0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_readlane_b32 s22, v22, 3 ; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_lshl_b32 s22, s22, 24 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s21, s22, s21 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: s_lshl_b32 s20, s68, 8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s20 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_lshl_b32 s20, s68, 8 ; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: s_and_b32 s20, s66, 0xff -; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_lshl_b32 s21, s64, 24 +; SI-NEXT: v_readlane_b32 s22, v23, 0 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_lshl_b32 s21, s22, 24 ; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: s_and_b32 s18, s19, 0xff -; SI-NEXT: v_readlane_b32 s19, v23, 63 -; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: v_readlane_b32 s19, v22, 2 ; SI-NEXT: s_lshl_b32 s19, s19, 8 -; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 ; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: v_readlane_b32 s19, v23, 62 -; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: v_readlane_b32 s19, v22, 1 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: v_readlane_b32 s20, v23, 61 -; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: v_readlane_b32 s20, v22, 0 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s20, 24 -; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: v_readlane_b32 s18, v23, 0 +; SI-NEXT: v_readlane_b32 s18, v23, 2 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 -; SI-NEXT: v_readlane_b32 s19, v23, 1 -; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: v_readlane_b32 s18, v23, 2 ; SI-NEXT: v_readlane_b32 s19, v23, 3 +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: v_readlane_b32 s18, v23, 4 +; SI-NEXT: v_readlane_b32 s19, v23, 5 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: v_readlane_b32 s20, v23, 4 +; SI-NEXT: v_readlane_b32 s20, v23, 6 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s20, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff @@ -82683,12 +82341,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v23, 60 +; SI-NEXT: v_readlane_b32 s17, v23, 63 ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v23, 59 +; SI-NEXT: v_readlane_b32 s17, v23, 62 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v23, 58 +; SI-NEXT: v_readlane_b32 s18, v23, 61 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff @@ -82698,15 +82356,15 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: v_readlane_b32 s16, v23, 6 +; SI-NEXT: v_readlane_b32 s16, v23, 8 ; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s17, v23, 7 -; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: v_readlane_b32 s16, v23, 8 ; SI-NEXT: v_readlane_b32 s17, v23, 9 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: v_readlane_b32 s16, v23, 10 +; SI-NEXT: v_readlane_b32 s17, v23, 11 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s18, v23, 10 +; SI-NEXT: v_readlane_b32 s18, v23, 12 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s18, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff @@ -82717,12 +82375,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: v_readlane_b32 s15, v23, 57 +; SI-NEXT: v_readlane_b32 s15, v23, 60 ; SI-NEXT: s_lshl_b32 s15, s15, 8 ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_readlane_b32 s15, v23, 56 +; SI-NEXT: v_readlane_b32 s15, v23, 59 ; SI-NEXT: s_and_b32 s15, s15, 0xff -; SI-NEXT: v_readlane_b32 s16, v23, 55 +; SI-NEXT: v_readlane_b32 s16, v23, 58 ; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff @@ -82732,15 +82390,15 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: v_readlane_b32 s14, v23, 12 +; SI-NEXT: v_readlane_b32 s14, v23, 14 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 8 -; SI-NEXT: v_readlane_b32 s15, v23, 13 -; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: v_readlane_b32 s14, v23, 14 ; SI-NEXT: v_readlane_b32 s15, v23, 15 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: v_readlane_b32 s14, v23, 16 +; SI-NEXT: v_readlane_b32 s15, v23, 17 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s16, v23, 16 +; SI-NEXT: v_readlane_b32 s16, v23, 18 ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_lshl_b32 s15, s16, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff @@ -82751,12 +82409,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: v_readlane_b32 s13, v23, 54 +; SI-NEXT: v_readlane_b32 s13, v23, 57 ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_readlane_b32 s13, v23, 53 +; SI-NEXT: v_readlane_b32 s13, v23, 56 ; SI-NEXT: s_and_b32 s13, s13, 0xff -; SI-NEXT: v_readlane_b32 s14, v23, 52 +; SI-NEXT: v_readlane_b32 s14, v23, 55 ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff @@ -82766,15 +82424,15 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: v_readlane_b32 s12, v23, 18 +; SI-NEXT: v_readlane_b32 s12, v23, 20 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 8 -; SI-NEXT: v_readlane_b32 s13, v23, 19 -; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: v_readlane_b32 s12, v23, 20 ; SI-NEXT: v_readlane_b32 s13, v23, 21 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: v_readlane_b32 s12, v23, 22 +; SI-NEXT: v_readlane_b32 s13, v23, 23 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s14, v23, 22 +; SI-NEXT: v_readlane_b32 s14, v23, 24 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_lshl_b32 s13, s14, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff @@ -82785,12 +82443,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v23, 51 +; SI-NEXT: v_readlane_b32 s11, v23, 54 ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_readlane_b32 s11, v23, 50 +; SI-NEXT: v_readlane_b32 s11, v23, 53 ; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: v_readlane_b32 s12, v23, 49 +; SI-NEXT: v_readlane_b32 s12, v23, 52 ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_lshl_b32 s12, s12, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff @@ -82800,15 +82458,15 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_readlane_b32 s10, v23, 24 +; SI-NEXT: v_readlane_b32 s10, v23, 26 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: v_readlane_b32 s11, v23, 25 -; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: v_readlane_b32 s10, v23, 26 ; SI-NEXT: v_readlane_b32 s11, v23, 27 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: v_readlane_b32 s10, v23, 28 +; SI-NEXT: v_readlane_b32 s11, v23, 29 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s12, v23, 28 +; SI-NEXT: v_readlane_b32 s12, v23, 30 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s11, s12, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff @@ -82819,12 +82477,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v23, 48 +; SI-NEXT: v_readlane_b32 s9, v23, 51 ; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_readlane_b32 s9, v23, 47 +; SI-NEXT: v_readlane_b32 s9, v23, 50 ; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: v_readlane_b32 s10, v23, 46 +; SI-NEXT: v_readlane_b32 s10, v23, 49 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_lshl_b32 s10, s10, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff @@ -82834,15 +82492,15 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_readlane_b32 s8, v23, 30 +; SI-NEXT: v_readlane_b32 s8, v23, 32 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v23, 31 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: v_readlane_b32 s8, v23, 32 ; SI-NEXT: v_readlane_b32 s9, v23, 33 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: v_readlane_b32 s8, v23, 34 +; SI-NEXT: v_readlane_b32 s9, v23, 35 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s10, v23, 34 +; SI-NEXT: v_readlane_b32 s10, v23, 36 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_lshl_b32 s9, s10, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff @@ -82853,12 +82511,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v23, 45 +; SI-NEXT: v_readlane_b32 s7, v23, 48 ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_readlane_b32 s7, v23, 44 +; SI-NEXT: v_readlane_b32 s7, v23, 47 ; SI-NEXT: s_and_b32 s7, s7, 0xff -; SI-NEXT: v_readlane_b32 s8, v23, 43 +; SI-NEXT: v_readlane_b32 s8, v23, 46 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s8, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff @@ -82868,13 +82526,15 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_readlane_b32 s6, v23, 36 +; SI-NEXT: v_readlane_b32 s6, v23, 38 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: v_readlane_b32 s7, v23, 37 +; SI-NEXT: v_readlane_b32 s7, v23, 39 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s54, 0xff -; SI-NEXT: v_readlane_b32 s8, v23, 38 +; SI-NEXT: v_readlane_b32 s6, v23, 40 +; SI-NEXT: v_readlane_b32 s7, v23, 41 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_readlane_b32 s8, v23, 42 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s8, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -82885,14 +82545,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: v_readlane_b32 s5, v23, 42 +; SI-NEXT: v_readlane_b32 s5, v23, 45 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v23, 41 +; SI-NEXT: v_readlane_b32 s5, v23, 44 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s6, v23, 40 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_lshl_b32 s6, s65, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 @@ -82900,13 +82559,14 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s21, v23, 5 -; SI-NEXT: v_readlane_b32 s19, v23, 11 -; SI-NEXT: v_readlane_b32 s17, v23, 17 -; SI-NEXT: v_readlane_b32 s15, v23, 23 -; SI-NEXT: v_readlane_b32 s13, v23, 29 -; SI-NEXT: v_readlane_b32 s11, v23, 35 -; SI-NEXT: v_readlane_b32 s9, v23, 39 +; SI-NEXT: v_readlane_b32 s23, v23, 1 +; SI-NEXT: v_readlane_b32 s21, v23, 7 +; SI-NEXT: v_readlane_b32 s19, v23, 13 +; SI-NEXT: v_readlane_b32 s17, v23, 19 +; SI-NEXT: v_readlane_b32 s15, v23, 25 +; SI-NEXT: v_readlane_b32 s13, v23, 31 +; SI-NEXT: v_readlane_b32 s11, v23, 37 +; SI-NEXT: v_readlane_b32 s9, v23, 43 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v21, 35 ; SI-NEXT: v_readlane_b32 s98, v21, 34 @@ -82952,170 +82612,172 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 0 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 1 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 2 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 3 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 4 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 5 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 6 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 7 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 8 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 9 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 10 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 11 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 12 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 13 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 14 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 15 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 16 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 17 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 18 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 19 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 20 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 21 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 22 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 23 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 24 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 25 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 27 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 28 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 29 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 30 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 31 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 32 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 33 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 34 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 35 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 36 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 37 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 38 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v23, s54, 0 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 1 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 2 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 3 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 4 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 5 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 6 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 7 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 8 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 9 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 10 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 11 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 12 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 13 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 14 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 15 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 16 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 17 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 18 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 19 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 20 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 21 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 22 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 23 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 24 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 25 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 27 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 28 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 29 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 30 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 31 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 32 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 33 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 34 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 35 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 39 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 36 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 40 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 37 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 41 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v23, vcc_lo, 42 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 38 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: v_writelane_b32 v23, vcc_hi, 43 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr86 @@ -83125,12 +82787,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: v_writelane_b32 v23, s55, 39 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v16i64_to_v128i8_scalar: @@ -83221,166 +82881,329 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: v_readfirstlane_b32 s5, v18 ; VI-NEXT: v_writelane_b32 v21, s87, 31 ; VI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; VI-NEXT: s_cbranch_scc0 .LBB57_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s26, s5, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 8 -; VI-NEXT: s_lshr_b32 s26, s5, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 9 -; VI-NEXT: s_lshr_b32 s26, s5, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 10 -; VI-NEXT: s_lshr_b32 s26, s4, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 11 -; VI-NEXT: s_lshr_b32 s26, s4, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 12 -; VI-NEXT: s_lshr_b32 s26, s7, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 13 -; VI-NEXT: s_lshr_b32 s26, s7, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 14 -; VI-NEXT: s_lshr_b32 s26, s7, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 15 -; VI-NEXT: s_lshr_b32 s26, s6, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 16 -; VI-NEXT: s_lshr_b32 s26, s6, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 17 -; VI-NEXT: s_lshr_b32 s26, s9, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 18 -; VI-NEXT: s_lshr_b32 s26, s9, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 19 -; VI-NEXT: s_lshr_b32 s26, s9, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 20 -; VI-NEXT: s_lshr_b32 s26, s8, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 21 -; VI-NEXT: s_lshr_b32 s26, s8, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 22 -; VI-NEXT: s_lshr_b32 s26, s11, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 23 -; VI-NEXT: s_lshr_b32 s26, s11, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 24 -; VI-NEXT: s_lshr_b32 s26, s11, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 25 -; VI-NEXT: s_lshr_b32 s26, s10, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 26 -; VI-NEXT: s_lshr_b32 s26, s10, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 27 -; VI-NEXT: s_lshr_b32 s26, s13, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 28 -; VI-NEXT: s_lshr_b32 s26, s13, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 29 -; VI-NEXT: s_lshr_b32 s26, s13, 8 +; VI-NEXT: s_lshr_b32 s26, s5, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 30 -; VI-NEXT: s_lshr_b32 s26, s12, 16 +; VI-NEXT: s_lshr_b32 s26, s5, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 31 -; VI-NEXT: s_lshr_b32 s26, s12, 8 +; VI-NEXT: s_lshr_b32 s26, s4, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 32 -; VI-NEXT: s_lshr_b32 s26, s15, 24 +; VI-NEXT: s_lshr_b32 s26, s4, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 33 -; VI-NEXT: s_lshr_b32 s26, s15, 16 +; VI-NEXT: s_lshr_b32 s26, s7, 24 ; VI-NEXT: v_writelane_b32 v22, s26, 34 -; VI-NEXT: s_lshr_b32 s26, s15, 8 +; VI-NEXT: s_lshr_b32 s26, s12, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 35 -; VI-NEXT: s_lshr_b32 s26, s14, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 36 -; VI-NEXT: s_lshr_b32 s26, s14, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 37 -; VI-NEXT: s_lshr_b32 s26, s17, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 38 -; VI-NEXT: s_lshr_b32 s26, s17, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 39 -; VI-NEXT: s_lshr_b32 s26, s17, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 40 -; VI-NEXT: s_lshr_b32 s26, s16, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 41 -; VI-NEXT: s_lshr_b32 s26, s16, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 42 ; VI-NEXT: s_lshr_b32 s26, s19, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 43 -; VI-NEXT: s_lshr_b32 s26, s19, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 44 -; VI-NEXT: s_lshr_b32 s26, s19, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 45 -; VI-NEXT: s_lshr_b32 s26, s18, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 46 -; VI-NEXT: s_lshr_b32 s26, s18, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 47 -; VI-NEXT: s_lshr_b32 s26, s21, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 48 -; VI-NEXT: s_lshr_b32 s26, s21, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 49 -; VI-NEXT: s_lshr_b32 s26, s21, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 50 -; VI-NEXT: s_lshr_b32 s26, s20, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 51 -; VI-NEXT: s_lshr_b32 s26, s20, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 52 -; VI-NEXT: s_lshr_b32 s26, s23, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 53 -; VI-NEXT: s_lshr_b32 s26, s23, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 54 -; VI-NEXT: s_lshr_b32 s26, s23, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 55 -; VI-NEXT: s_lshr_b32 s26, s22, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 56 -; VI-NEXT: s_lshr_b32 s26, s22, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 57 -; VI-NEXT: s_lshr_b32 s26, s25, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 58 -; VI-NEXT: s_lshr_b32 s26, s25, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 59 -; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 6 -; VI-NEXT: v_writelane_b32 v22, s61, 7 -; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 4 -; VI-NEXT: v_writelane_b32 v22, s61, 5 -; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 2 -; VI-NEXT: v_writelane_b32 v22, s61, 3 -; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 0 -; VI-NEXT: s_lshr_b32 s66, s25, 8 -; VI-NEXT: s_lshr_b32 s67, s24, 16 -; VI-NEXT: s_lshr_b32 s68, s24, 8 -; VI-NEXT: s_lshr_b32 s69, s41, 24 -; VI-NEXT: s_lshr_b32 s70, s41, 16 -; VI-NEXT: s_lshr_b32 s71, s41, 8 -; VI-NEXT: s_lshr_b32 s80, s40, 16 -; VI-NEXT: s_lshr_b32 s81, s40, 8 -; VI-NEXT: s_lshr_b32 s82, s43, 24 -; VI-NEXT: s_lshr_b32 s83, s43, 16 -; VI-NEXT: s_lshr_b32 s84, s43, 8 -; VI-NEXT: s_lshr_b32 s85, s42, 16 -; VI-NEXT: s_lshr_b32 s86, s42, 8 -; VI-NEXT: s_lshr_b32 s87, s45, 24 -; VI-NEXT: s_lshr_b32 s50, s45, 16 -; VI-NEXT: s_lshr_b32 s26, s45, 8 +; VI-NEXT: v_writelane_b32 v22, s26, 36 +; VI-NEXT: s_lshr_b32 s27, s40, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 37 +; VI-NEXT: s_lshr_b32 s27, s40, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 38 +; VI-NEXT: s_lshr_b32 s27, s43, 24 +; VI-NEXT: v_writelane_b32 v22, s27, 39 +; VI-NEXT: s_lshr_b32 s27, s43, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 40 +; VI-NEXT: s_lshr_b32 s27, s43, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 41 +; VI-NEXT: s_lshr_b32 s27, s42, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 42 +; VI-NEXT: s_lshr_b32 s27, s42, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 43 +; VI-NEXT: s_lshr_b32 s27, s45, 24 +; VI-NEXT: v_writelane_b32 v22, s27, 44 +; VI-NEXT: s_lshr_b32 s27, s45, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 45 +; VI-NEXT: s_lshr_b32 s27, s45, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 46 ; VI-NEXT: s_lshr_b32 s27, s44, 16 -; VI-NEXT: s_lshr_b32 s28, s44, 8 -; VI-NEXT: s_lshr_b32 s29, s47, 24 -; VI-NEXT: s_lshr_b32 s51, s47, 16 -; VI-NEXT: s_lshr_b32 s52, s47, 8 -; VI-NEXT: s_lshr_b32 s53, s46, 16 -; VI-NEXT: s_lshr_b32 s54, s46, 8 -; VI-NEXT: s_lshr_b32 s58, s57, 24 -; VI-NEXT: s_lshr_b32 s59, s57, 16 -; VI-NEXT: s_lshr_b32 s55, s57, 8 -; VI-NEXT: s_lshr_b32 s64, s56, 16 -; VI-NEXT: s_lshr_b32 s65, s56, 8 -; VI-NEXT: v_writelane_b32 v22, s61, 1 -; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; VI-NEXT: v_writelane_b32 v22, s27, 47 +; VI-NEXT: s_lshr_b32 s27, s44, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 48 +; VI-NEXT: s_lshr_b32 s27, s47, 24 +; VI-NEXT: v_writelane_b32 v22, s27, 49 +; VI-NEXT: s_lshr_b32 s27, s47, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 50 +; VI-NEXT: s_lshr_b32 s27, s47, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 51 +; VI-NEXT: s_lshr_b32 s27, s46, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 52 +; VI-NEXT: s_lshr_b32 s27, s46, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 53 +; VI-NEXT: s_lshr_b32 s27, s57, 24 +; VI-NEXT: v_writelane_b32 v22, s27, 54 +; VI-NEXT: s_lshr_b32 s27, s57, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 55 +; VI-NEXT: s_lshr_b32 s27, s57, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 56 +; VI-NEXT: s_lshr_b32 s27, s56, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 28 +; VI-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 26 +; VI-NEXT: v_writelane_b32 v22, s91, 27 +; VI-NEXT: s_lshr_b64 s[90:91], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 24 +; VI-NEXT: v_writelane_b32 v22, s91, 25 +; VI-NEXT: s_lshr_b64 s[90:91], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 22 +; VI-NEXT: v_writelane_b32 v22, s91, 23 +; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 20 +; VI-NEXT: v_writelane_b32 v22, s91, 21 +; VI-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 18 +; VI-NEXT: v_writelane_b32 v22, s91, 19 +; VI-NEXT: s_lshr_b64 s[90:91], s[14:15], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 16 +; VI-NEXT: v_writelane_b32 v22, s91, 17 +; VI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 14 +; VI-NEXT: v_writelane_b32 v22, s91, 15 +; VI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 12 +; VI-NEXT: v_writelane_b32 v22, s91, 13 +; VI-NEXT: s_lshr_b64 s[90:91], s[20:21], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 10 +; VI-NEXT: v_writelane_b32 v22, s91, 11 +; VI-NEXT: s_lshr_b64 s[90:91], s[22:23], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 8 +; VI-NEXT: v_writelane_b32 v22, s91, 9 ; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 -; VI-NEXT: s_cbranch_execnz .LBB57_3 -; VI-NEXT: .LBB57_2: ; %cmp.true +; VI-NEXT: v_writelane_b32 v22, s90, 6 +; VI-NEXT: v_writelane_b32 v22, s91, 7 +; VI-NEXT: s_lshr_b64 s[90:91], s[40:41], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 4 +; VI-NEXT: v_writelane_b32 v22, s91, 5 +; VI-NEXT: s_lshr_b64 s[90:91], s[42:43], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 2 +; VI-NEXT: v_writelane_b32 v22, s91, 3 +; VI-NEXT: s_lshr_b64 s[90:91], s[44:45], 24 +; VI-NEXT: v_writelane_b32 v22, s90, 0 +; VI-NEXT: s_lshr_b32 s48, s7, 16 +; VI-NEXT: s_lshr_b32 s78, s6, 16 +; VI-NEXT: v_writelane_b32 v22, s91, 1 +; VI-NEXT: s_lshr_b64 s[90:91], s[46:47], 24 +; VI-NEXT: s_lshr_b32 s49, s7, 8 +; VI-NEXT: s_lshr_b32 s84, s6, 8 +; VI-NEXT: s_lshr_b32 s28, s9, 24 +; VI-NEXT: s_lshr_b32 s85, s9, 16 +; VI-NEXT: s_lshr_b32 s86, s9, 8 +; VI-NEXT: s_lshr_b32 s61, s8, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 8 +; VI-NEXT: s_lshr_b32 s88, s11, 24 +; VI-NEXT: s_lshr_b32 s89, s11, 16 +; VI-NEXT: s_lshr_b32 s87, s11, 8 +; VI-NEXT: s_lshr_b32 s50, s10, 16 +; VI-NEXT: s_lshr_b32 s51, s10, 8 +; VI-NEXT: s_lshr_b32 s58, s13, 24 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s52, s13, 8 +; VI-NEXT: s_lshr_b32 s30, s12, 8 +; VI-NEXT: s_lshr_b32 s31, s15, 24 +; VI-NEXT: s_lshr_b32 s53, s15, 16 +; VI-NEXT: s_lshr_b32 s54, s15, 8 +; VI-NEXT: s_lshr_b32 s55, s14, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 8 +; VI-NEXT: s_lshr_b32 s34, s17, 24 +; VI-NEXT: s_lshr_b32 s35, s17, 16 +; VI-NEXT: s_lshr_b32 s64, s17, 8 +; VI-NEXT: s_lshr_b32 s65, s16, 16 +; VI-NEXT: s_lshr_b32 s37, s16, 8 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s62, s19, 8 +; VI-NEXT: s_lshr_b32 s63, s18, 16 +; VI-NEXT: s_lshr_b32 s66, s18, 8 +; VI-NEXT: s_lshr_b32 s38, s21, 24 +; VI-NEXT: s_lshr_b32 s39, s21, 16 +; VI-NEXT: s_lshr_b32 s67, s21, 8 +; VI-NEXT: s_lshr_b32 s68, s20, 16 +; VI-NEXT: s_lshr_b32 s69, s20, 8 +; VI-NEXT: s_lshr_b32 s72, s23, 24 +; VI-NEXT: s_lshr_b32 s73, s23, 16 +; VI-NEXT: s_lshr_b32 s70, s23, 8 +; VI-NEXT: s_lshr_b32 s71, s22, 16 +; VI-NEXT: s_lshr_b32 s80, s22, 8 +; VI-NEXT: s_lshr_b32 s74, s25, 24 +; VI-NEXT: s_lshr_b32 s75, s25, 16 +; VI-NEXT: s_lshr_b32 s81, s25, 8 +; VI-NEXT: s_lshr_b32 s82, s24, 16 +; VI-NEXT: s_lshr_b32 s83, s24, 8 +; VI-NEXT: s_lshr_b32 s76, s41, 24 +; VI-NEXT: s_lshr_b32 s77, s41, 16 +; VI-NEXT: s_lshr_b32 s26, s41, 8 +; VI-NEXT: s_lshr_b32 s27, s56, 8 +; VI-NEXT: s_mov_b32 s91, s48 +; VI-NEXT: s_mov_b32 s48, s78 +; VI-NEXT: s_lshr_b64 s[78:79], s[56:57], 24 +; VI-NEXT: s_mov_b64 vcc, 0 +; VI-NEXT: s_branch .LBB57_3 +; VI-NEXT: .LBB57_2: +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: ; kill: killed $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; kill: killed $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; kill: killed $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; kill: killed $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; kill: killed $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; kill: killed $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, s26, 0 +; VI-NEXT: v_writelane_b32 v22, s27, 1 +; VI-NEXT: v_writelane_b32 v22, s28, 2 +; VI-NEXT: v_writelane_b32 v22, s29, 3 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 4 +; VI-NEXT: v_writelane_b32 v22, s29, 5 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 6 +; VI-NEXT: v_writelane_b32 v22, s29, 7 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 8 +; VI-NEXT: v_writelane_b32 v22, s29, 9 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 10 +; VI-NEXT: v_writelane_b32 v22, s29, 11 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 12 +; VI-NEXT: v_writelane_b32 v22, s29, 13 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; kill: killed $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 14 +; VI-NEXT: v_writelane_b32 v22, s29, 15 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 16 +; VI-NEXT: v_writelane_b32 v22, s29, 17 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; kill: killed $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 18 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, s29, 19 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 20 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, s29, 21 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: v_writelane_b32 v22, s28, 22 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: v_writelane_b32 v22, s29, 23 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 24 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 25 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 26 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 27 +; VI-NEXT: s_mov_b64 vcc, -1 +; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: .LBB57_3: ; %Flow +; VI-NEXT: s_andn2_b64 vcc, exec, vcc +; VI-NEXT: s_mov_b32 s79, s84 +; VI-NEXT: s_mov_b32 s84, s28 +; VI-NEXT: s_mov_b32 s28, s61 +; VI-NEXT: s_mov_b32 s61, s26 +; VI-NEXT: v_readlane_b32 s26, v22, 28 +; VI-NEXT: s_cbranch_vccnz .LBB57_5 +; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: s_add_u32 s56, s56, 3 ; VI-NEXT: s_addc_u32 s57, s57, 0 ; VI-NEXT: s_add_u32 s46, s46, 3 @@ -83413,385 +83236,392 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_addc_u32 s7, s7, 0 ; VI-NEXT: s_add_u32 s4, s4, 3 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: s_lshr_b32 s26, s5, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 8 -; VI-NEXT: s_lshr_b32 s26, s5, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 9 -; VI-NEXT: s_lshr_b32 s26, s5, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 10 -; VI-NEXT: s_lshr_b32 s26, s4, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 11 -; VI-NEXT: s_lshr_b32 s26, s4, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 12 -; VI-NEXT: s_lshr_b32 s26, s7, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 13 -; VI-NEXT: s_lshr_b32 s26, s7, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 14 -; VI-NEXT: s_lshr_b32 s26, s7, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 15 -; VI-NEXT: s_lshr_b32 s26, s6, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 16 -; VI-NEXT: s_lshr_b32 s26, s6, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 17 -; VI-NEXT: s_lshr_b32 s26, s9, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 18 -; VI-NEXT: s_lshr_b32 s26, s9, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 19 -; VI-NEXT: s_lshr_b32 s26, s9, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 20 -; VI-NEXT: s_lshr_b32 s26, s8, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 21 -; VI-NEXT: s_lshr_b32 s26, s8, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 22 -; VI-NEXT: s_lshr_b32 s26, s11, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 23 -; VI-NEXT: s_lshr_b32 s26, s11, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 24 -; VI-NEXT: s_lshr_b32 s26, s11, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 25 -; VI-NEXT: s_lshr_b32 s26, s10, 16 +; VI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 ; VI-NEXT: v_writelane_b32 v22, s26, 26 -; VI-NEXT: s_lshr_b32 s26, s10, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 27 -; VI-NEXT: s_lshr_b32 s26, s13, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 28 -; VI-NEXT: s_lshr_b32 s26, s13, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 27 +; VI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 24 +; VI-NEXT: v_writelane_b32 v22, s27, 25 +; VI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 22 +; VI-NEXT: v_writelane_b32 v22, s27, 23 +; VI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 20 +; VI-NEXT: v_writelane_b32 v22, s27, 21 +; VI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 18 +; VI-NEXT: v_writelane_b32 v22, s27, 19 +; VI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 16 +; VI-NEXT: v_writelane_b32 v22, s27, 17 +; VI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 14 +; VI-NEXT: v_writelane_b32 v22, s27, 15 +; VI-NEXT: s_lshr_b64 s[26:27], s[18:19], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 12 +; VI-NEXT: v_writelane_b32 v22, s27, 13 +; VI-NEXT: s_lshr_b64 s[26:27], s[20:21], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 10 +; VI-NEXT: v_writelane_b32 v22, s27, 11 +; VI-NEXT: s_lshr_b64 s[26:27], s[22:23], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 8 +; VI-NEXT: v_writelane_b32 v22, s27, 9 +; VI-NEXT: s_lshr_b64 s[26:27], s[24:25], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 6 +; VI-NEXT: v_writelane_b32 v22, s27, 7 +; VI-NEXT: s_lshr_b64 s[26:27], s[40:41], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 4 +; VI-NEXT: v_writelane_b32 v22, s27, 5 +; VI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 2 +; VI-NEXT: v_writelane_b32 v22, s27, 3 +; VI-NEXT: s_lshr_b64 s[26:27], s[44:45], 24 +; VI-NEXT: v_writelane_b32 v22, s26, 0 +; VI-NEXT: v_writelane_b32 v22, s27, 1 +; VI-NEXT: s_lshr_b32 s26, s5, 24 ; VI-NEXT: v_writelane_b32 v22, s26, 29 -; VI-NEXT: s_lshr_b32 s26, s13, 8 +; VI-NEXT: s_lshr_b32 s26, s5, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 30 -; VI-NEXT: s_lshr_b32 s26, s12, 16 +; VI-NEXT: s_lshr_b32 s26, s5, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 31 -; VI-NEXT: s_lshr_b32 s26, s12, 8 +; VI-NEXT: s_lshr_b32 s26, s4, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 32 -; VI-NEXT: s_lshr_b32 s26, s15, 24 +; VI-NEXT: s_lshr_b32 s26, s4, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 33 -; VI-NEXT: s_lshr_b32 s26, s15, 16 +; VI-NEXT: s_lshr_b32 s26, s7, 24 ; VI-NEXT: v_writelane_b32 v22, s26, 34 -; VI-NEXT: s_lshr_b32 s26, s15, 8 +; VI-NEXT: s_lshr_b32 s26, s12, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 35 -; VI-NEXT: s_lshr_b32 s26, s14, 16 +; VI-NEXT: s_lshr_b32 s26, s19, 24 ; VI-NEXT: v_writelane_b32 v22, s26, 36 -; VI-NEXT: s_lshr_b32 s26, s14, 8 +; VI-NEXT: s_lshr_b32 s26, s40, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 37 -; VI-NEXT: s_lshr_b32 s26, s17, 24 +; VI-NEXT: s_lshr_b32 s26, s40, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 38 -; VI-NEXT: s_lshr_b32 s26, s17, 16 +; VI-NEXT: s_lshr_b32 s26, s43, 24 ; VI-NEXT: v_writelane_b32 v22, s26, 39 -; VI-NEXT: s_lshr_b32 s26, s17, 8 +; VI-NEXT: s_lshr_b32 s26, s43, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 40 -; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: s_lshr_b32 s26, s43, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 41 -; VI-NEXT: s_lshr_b32 s26, s16, 8 +; VI-NEXT: s_lshr_b32 s26, s42, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 42 -; VI-NEXT: s_lshr_b32 s26, s19, 24 +; VI-NEXT: s_lshr_b32 s26, s42, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 43 -; VI-NEXT: s_lshr_b32 s26, s19, 16 +; VI-NEXT: s_lshr_b32 s26, s45, 24 ; VI-NEXT: v_writelane_b32 v22, s26, 44 -; VI-NEXT: s_lshr_b32 s26, s19, 8 +; VI-NEXT: s_lshr_b32 s26, s45, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 45 -; VI-NEXT: s_lshr_b32 s26, s18, 16 +; VI-NEXT: s_lshr_b32 s26, s45, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 46 -; VI-NEXT: s_lshr_b32 s26, s18, 8 +; VI-NEXT: s_lshr_b32 s26, s44, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 47 -; VI-NEXT: s_lshr_b32 s26, s21, 24 +; VI-NEXT: s_lshr_b32 s26, s44, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 48 -; VI-NEXT: s_lshr_b32 s26, s21, 16 +; VI-NEXT: s_lshr_b32 s26, s47, 24 ; VI-NEXT: v_writelane_b32 v22, s26, 49 -; VI-NEXT: s_lshr_b32 s26, s21, 8 +; VI-NEXT: s_lshr_b32 s26, s47, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 50 -; VI-NEXT: s_lshr_b32 s26, s20, 16 +; VI-NEXT: s_lshr_b32 s26, s47, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 51 -; VI-NEXT: s_lshr_b32 s26, s20, 8 +; VI-NEXT: s_lshr_b32 s26, s46, 16 ; VI-NEXT: v_writelane_b32 v22, s26, 52 -; VI-NEXT: s_lshr_b32 s26, s23, 24 +; VI-NEXT: s_lshr_b32 s26, s46, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 53 -; VI-NEXT: s_lshr_b32 s26, s23, 16 +; VI-NEXT: s_lshr_b32 s26, s57, 24 ; VI-NEXT: v_writelane_b32 v22, s26, 54 -; VI-NEXT: s_lshr_b32 s26, s23, 8 +; VI-NEXT: s_lshr_b32 s26, s57, 16 +; VI-NEXT: s_lshr_b64 s[90:91], s[46:47], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[56:57], 24 ; VI-NEXT: v_writelane_b32 v22, s26, 55 -; VI-NEXT: s_lshr_b32 s26, s22, 16 +; VI-NEXT: s_lshr_b32 s26, s57, 8 +; VI-NEXT: s_lshr_b32 s91, s7, 16 +; VI-NEXT: s_lshr_b32 s49, s7, 8 +; VI-NEXT: s_lshr_b32 s48, s6, 16 +; VI-NEXT: s_lshr_b32 s79, s6, 8 +; VI-NEXT: s_lshr_b32 s84, s9, 24 +; VI-NEXT: s_lshr_b32 s85, s9, 16 +; VI-NEXT: s_lshr_b32 s86, s9, 8 +; VI-NEXT: s_lshr_b32 s28, s8, 16 +; VI-NEXT: s_lshr_b32 s29, s8, 8 +; VI-NEXT: s_lshr_b32 s88, s11, 24 +; VI-NEXT: s_lshr_b32 s89, s11, 16 +; VI-NEXT: s_lshr_b32 s87, s11, 8 +; VI-NEXT: s_lshr_b32 s50, s10, 16 +; VI-NEXT: s_lshr_b32 s51, s10, 8 +; VI-NEXT: s_lshr_b32 s58, s13, 24 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: s_lshr_b32 s52, s13, 8 +; VI-NEXT: s_lshr_b32 s30, s12, 8 +; VI-NEXT: s_lshr_b32 s31, s15, 24 +; VI-NEXT: s_lshr_b32 s53, s15, 16 +; VI-NEXT: s_lshr_b32 s54, s15, 8 +; VI-NEXT: s_lshr_b32 s55, s14, 16 +; VI-NEXT: s_lshr_b32 s60, s14, 8 +; VI-NEXT: s_lshr_b32 s34, s17, 24 +; VI-NEXT: s_lshr_b32 s35, s17, 16 +; VI-NEXT: s_lshr_b32 s64, s17, 8 +; VI-NEXT: s_lshr_b32 s65, s16, 16 +; VI-NEXT: s_lshr_b32 s37, s16, 8 +; VI-NEXT: s_lshr_b32 s36, s19, 16 +; VI-NEXT: s_lshr_b32 s62, s19, 8 +; VI-NEXT: s_lshr_b32 s63, s18, 16 +; VI-NEXT: s_lshr_b32 s66, s18, 8 +; VI-NEXT: s_lshr_b32 s38, s21, 24 +; VI-NEXT: s_lshr_b32 s39, s21, 16 +; VI-NEXT: s_lshr_b32 s67, s21, 8 +; VI-NEXT: s_lshr_b32 s68, s20, 16 +; VI-NEXT: s_lshr_b32 s69, s20, 8 +; VI-NEXT: s_lshr_b32 s72, s23, 24 +; VI-NEXT: s_lshr_b32 s73, s23, 16 +; VI-NEXT: s_lshr_b32 s70, s23, 8 +; VI-NEXT: s_lshr_b32 s71, s22, 16 +; VI-NEXT: s_lshr_b32 s80, s22, 8 +; VI-NEXT: s_lshr_b32 s74, s25, 24 +; VI-NEXT: s_lshr_b32 s75, s25, 16 +; VI-NEXT: s_lshr_b32 s81, s25, 8 +; VI-NEXT: s_lshr_b32 s82, s24, 16 +; VI-NEXT: s_lshr_b32 s83, s24, 8 +; VI-NEXT: s_lshr_b32 s76, s41, 24 +; VI-NEXT: s_lshr_b32 s77, s41, 16 +; VI-NEXT: s_lshr_b32 s61, s41, 8 ; VI-NEXT: v_writelane_b32 v22, s26, 56 -; VI-NEXT: s_lshr_b32 s26, s22, 8 -; VI-NEXT: v_writelane_b32 v22, s26, 57 -; VI-NEXT: s_lshr_b32 s26, s25, 24 -; VI-NEXT: v_writelane_b32 v22, s26, 58 -; VI-NEXT: s_lshr_b32 s26, s25, 16 -; VI-NEXT: v_writelane_b32 v22, s26, 59 -; VI-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 6 -; VI-NEXT: v_writelane_b32 v22, s61, 7 -; VI-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 4 -; VI-NEXT: v_writelane_b32 v22, s61, 5 -; VI-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 -; VI-NEXT: v_writelane_b32 v22, s60, 2 -; VI-NEXT: v_writelane_b32 v22, s61, 3 -; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: s_lshr_b32 s66, s25, 8 -; VI-NEXT: s_lshr_b32 s67, s24, 16 -; VI-NEXT: s_lshr_b32 s68, s24, 8 -; VI-NEXT: s_lshr_b32 s69, s41, 24 -; VI-NEXT: s_lshr_b32 s70, s41, 16 -; VI-NEXT: s_lshr_b32 s71, s41, 8 -; VI-NEXT: s_lshr_b32 s80, s40, 16 -; VI-NEXT: s_lshr_b32 s81, s40, 8 -; VI-NEXT: s_lshr_b32 s82, s43, 24 -; VI-NEXT: s_lshr_b32 s83, s43, 16 -; VI-NEXT: s_lshr_b32 s84, s43, 8 -; VI-NEXT: s_lshr_b32 s85, s42, 16 -; VI-NEXT: s_lshr_b32 s86, s42, 8 -; VI-NEXT: s_lshr_b32 s87, s45, 24 -; VI-NEXT: s_lshr_b32 s50, s45, 16 -; VI-NEXT: s_lshr_b32 s26, s45, 8 -; VI-NEXT: s_lshr_b32 s27, s44, 16 -; VI-NEXT: s_lshr_b32 s28, s44, 8 -; VI-NEXT: s_lshr_b32 s29, s47, 24 -; VI-NEXT: s_lshr_b32 s51, s47, 16 -; VI-NEXT: s_lshr_b32 s52, s47, 8 -; VI-NEXT: s_lshr_b32 s53, s46, 16 -; VI-NEXT: s_lshr_b32 s54, s46, 8 -; VI-NEXT: s_lshr_b32 s58, s57, 24 -; VI-NEXT: s_lshr_b32 s59, s57, 16 -; VI-NEXT: s_lshr_b32 s55, s57, 8 -; VI-NEXT: s_lshr_b32 s64, s56, 16 -; VI-NEXT: s_lshr_b32 s65, s56, 8 -; VI-NEXT: v_writelane_b32 v22, s60, 0 -; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 -; VI-NEXT: v_writelane_b32 v22, s61, 1 -; VI-NEXT: .LBB57_3: ; %end -; VI-NEXT: s_lshl_b32 s61, s65, 8 +; VI-NEXT: s_lshr_b32 s26, s56, 16 +; VI-NEXT: s_lshr_b32 s27, s56, 8 +; VI-NEXT: .LBB57_5: ; %end ; VI-NEXT: s_and_b32 s56, s56, 0xff -; VI-NEXT: s_or_b32 s56, s56, s61 -; VI-NEXT: s_lshl_b32 s61, s48, 8 -; VI-NEXT: s_and_b32 s63, s64, 0xff -; VI-NEXT: s_or_b32 s61, s63, s61 -; VI-NEXT: s_and_b32 s56, s56, 0xffff -; VI-NEXT: s_lshl_b32 s61, s61, 16 -; VI-NEXT: s_or_b32 s56, s56, s61 -; VI-NEXT: v_mov_b32_e32 v1, s56 -; VI-NEXT: s_and_b32 s56, s57, 0xff -; VI-NEXT: s_lshl_b32 s57, s55, 8 -; VI-NEXT: s_or_b32 s56, s56, s57 -; VI-NEXT: s_and_b32 s57, s59, 0xff -; VI-NEXT: s_lshl_b32 s58, s58, 8 -; VI-NEXT: s_or_b32 s57, s57, s58 -; VI-NEXT: s_and_b32 s56, s56, 0xffff -; VI-NEXT: s_lshl_b32 s57, s57, 16 -; VI-NEXT: s_or_b32 s56, s56, s57 -; VI-NEXT: v_mov_b32_e32 v2, s56 -; VI-NEXT: s_lshl_b32 s56, s54, 8 -; VI-NEXT: s_and_b32 s46, s46, 0xff -; VI-NEXT: s_or_b32 s46, s46, s56 -; VI-NEXT: s_lshl_b32 s56, s38, 8 -; VI-NEXT: s_and_b32 s57, s53, 0xff -; VI-NEXT: s_or_b32 s56, s57, s56 -; VI-NEXT: s_and_b32 s46, s46, 0xffff -; VI-NEXT: s_lshl_b32 s56, s56, 16 -; VI-NEXT: s_or_b32 s46, s46, s56 -; VI-NEXT: v_mov_b32_e32 v3, s46 -; VI-NEXT: s_and_b32 s46, s47, 0xff -; VI-NEXT: s_lshl_b32 s47, s52, 8 -; VI-NEXT: s_or_b32 s46, s46, s47 -; VI-NEXT: s_and_b32 s47, s51, 0xff -; VI-NEXT: s_lshl_b32 s29, s29, 8 -; VI-NEXT: s_or_b32 s29, s47, s29 -; VI-NEXT: s_and_b32 s46, s46, 0xffff -; VI-NEXT: s_lshl_b32 s29, s29, 16 -; VI-NEXT: s_or_b32 s29, s46, s29 -; VI-NEXT: v_mov_b32_e32 v4, s29 -; VI-NEXT: s_lshl_b32 s28, s28, 8 -; VI-NEXT: s_and_b32 s29, s44, 0xff -; VI-NEXT: s_or_b32 s28, s29, s28 -; VI-NEXT: s_lshl_b32 s29, s36, 8 +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s27, s56, s27 +; VI-NEXT: s_and_b32 s26, s26, 0xff +; VI-NEXT: s_lshl_b32 s56, s78, 8 +; VI-NEXT: s_or_b32 s26, s26, s56 +; VI-NEXT: s_and_b32 s27, s27, 0xffff +; VI-NEXT: s_lshl_b32 s26, s26, 16 +; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: v_readlane_b32 s27, v22, 56 +; VI-NEXT: v_mov_b32_e32 v1, s26 +; VI-NEXT: s_and_b32 s26, s57, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 55 +; VI-NEXT: v_readlane_b32 s56, v22, 54 ; VI-NEXT: s_and_b32 s27, s27, 0xff -; VI-NEXT: s_or_b32 s27, s27, s29 -; VI-NEXT: s_and_b32 s28, s28, 0xffff +; VI-NEXT: s_lshl_b32 s56, s56, 8 +; VI-NEXT: s_or_b32 s27, s27, s56 +; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 -; VI-NEXT: s_or_b32 s27, s28, s27 -; VI-NEXT: v_mov_b32_e32 v5, s27 -; VI-NEXT: s_and_b32 s27, s45, 0xff -; VI-NEXT: s_lshl_b32 s26, s26, 8 -; VI-NEXT: s_or_b32 s26, s27, s26 -; VI-NEXT: s_and_b32 s27, s50, 0xff -; VI-NEXT: s_lshl_b32 s28, s87, 8 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 53 +; VI-NEXT: v_mov_b32_e32 v2, s26 +; VI-NEXT: s_and_b32 s26, s46, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 52 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s46, s90, 8 +; VI-NEXT: s_or_b32 s27, s27, s46 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 51 +; VI-NEXT: v_mov_b32_e32 v3, s26 +; VI-NEXT: s_and_b32 s26, s47, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 50 +; VI-NEXT: v_readlane_b32 s46, v22, 49 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s46, s46, 8 +; VI-NEXT: s_or_b32 s27, s27, s46 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 48 +; VI-NEXT: v_mov_b32_e32 v4, s26 +; VI-NEXT: s_and_b32 s26, s44, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 47 +; VI-NEXT: v_readlane_b32 s46, v22, 0 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s44, s46, 8 +; VI-NEXT: s_or_b32 s27, s27, s44 +; VI-NEXT: s_and_b32 s26, s26, 0xffff +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 46 +; VI-NEXT: v_mov_b32_e32 v5, s26 +; VI-NEXT: s_and_b32 s26, s45, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 45 +; VI-NEXT: v_readlane_b32 s44, v22, 44 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s44, s44, 8 +; VI-NEXT: s_or_b32 s27, s27, s44 ; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 ; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 43 ; VI-NEXT: v_mov_b32_e32 v6, s26 -; VI-NEXT: s_lshl_b32 s26, s86, 8 -; VI-NEXT: s_and_b32 s27, s42, 0xff -; VI-NEXT: s_or_b32 s26, s27, s26 -; VI-NEXT: s_lshl_b32 s27, s34, 8 -; VI-NEXT: s_and_b32 s28, s85, 0xff -; VI-NEXT: s_or_b32 s27, s28, s27 +; VI-NEXT: s_and_b32 s26, s42, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 42 +; VI-NEXT: v_readlane_b32 s44, v22, 2 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s42, s44, 8 +; VI-NEXT: s_or_b32 s27, s27, s42 ; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 ; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 41 ; VI-NEXT: v_mov_b32_e32 v7, s26 ; VI-NEXT: s_and_b32 s26, s43, 0xff -; VI-NEXT: s_lshl_b32 s27, s84, 8 +; VI-NEXT: s_lshl_b32 s27, s27, 8 ; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, s83, 0xff -; VI-NEXT: s_lshl_b32 s28, s82, 8 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: v_readlane_b32 s27, v22, 40 +; VI-NEXT: v_readlane_b32 s42, v22, 39 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s42, s42, 8 +; VI-NEXT: s_or_b32 s27, s27, s42 ; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 ; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 38 ; VI-NEXT: v_mov_b32_e32 v8, s26 -; VI-NEXT: s_lshl_b32 s26, s81, 8 -; VI-NEXT: s_and_b32 s27, s40, 0xff -; VI-NEXT: s_or_b32 s26, s27, s26 -; VI-NEXT: s_lshl_b32 s27, s30, 8 -; VI-NEXT: s_and_b32 s28, s80, 0xff -; VI-NEXT: s_or_b32 s27, s28, s27 +; VI-NEXT: s_and_b32 s26, s40, 0xff +; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 +; VI-NEXT: v_readlane_b32 s27, v22, 37 +; VI-NEXT: v_readlane_b32 s42, v22, 4 +; VI-NEXT: s_and_b32 s27, s27, 0xff +; VI-NEXT: s_lshl_b32 s40, s42, 8 +; VI-NEXT: s_or_b32 s27, s27, s40 ; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 ; VI-NEXT: s_or_b32 s26, s26, s27 ; VI-NEXT: v_mov_b32_e32 v9, s26 ; VI-NEXT: s_and_b32 s26, s41, 0xff -; VI-NEXT: s_lshl_b32 s27, s71, 8 +; VI-NEXT: s_lshl_b32 s27, s61, 8 ; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: s_and_b32 s27, s70, 0xff -; VI-NEXT: s_lshl_b32 s28, s69, 8 -; VI-NEXT: s_or_b32 s27, s27, s28 +; VI-NEXT: s_and_b32 s27, s77, 0xff +; VI-NEXT: s_lshl_b32 s40, s76, 8 +; VI-NEXT: s_or_b32 s27, s27, s40 ; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 ; VI-NEXT: s_or_b32 s26, s26, s27 ; VI-NEXT: v_mov_b32_e32 v10, s26 -; VI-NEXT: s_lshl_b32 s26, s68, 8 ; VI-NEXT: s_and_b32 s24, s24, 0xff +; VI-NEXT: s_lshl_b32 s26, s83, 8 +; VI-NEXT: v_readlane_b32 s40, v22, 6 ; VI-NEXT: s_or_b32 s24, s24, s26 -; VI-NEXT: s_lshl_b32 s26, s90, 8 -; VI-NEXT: s_and_b32 s27, s67, 0xff -; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: s_and_b32 s26, s82, 0xff +; VI-NEXT: s_lshl_b32 s27, s40, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 ; VI-NEXT: s_and_b32 s24, s24, 0xffff ; VI-NEXT: s_lshl_b32 s26, s26, 16 ; VI-NEXT: s_or_b32 s24, s24, s26 ; VI-NEXT: v_mov_b32_e32 v11, s24 ; VI-NEXT: s_and_b32 s24, s25, 0xff -; VI-NEXT: s_lshl_b32 s25, s66, 8 +; VI-NEXT: s_lshl_b32 s25, s81, 8 ; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: v_readlane_b32 s25, v22, 59 -; VI-NEXT: v_readlane_b32 s26, v22, 58 -; VI-NEXT: s_and_b32 s25, s25, 0xff -; VI-NEXT: s_lshl_b32 s26, s26, 8 +; VI-NEXT: s_and_b32 s25, s75, 0xff +; VI-NEXT: s_lshl_b32 s26, s74, 8 ; VI-NEXT: s_or_b32 s25, s25, s26 ; VI-NEXT: s_and_b32 s24, s24, 0xffff ; VI-NEXT: s_lshl_b32 s25, s25, 16 ; VI-NEXT: s_or_b32 s24, s24, s25 ; VI-NEXT: v_mov_b32_e32 v12, s24 -; VI-NEXT: v_readlane_b32 s24, v22, 57 -; VI-NEXT: s_lshl_b32 s24, s24, 8 ; VI-NEXT: s_and_b32 s22, s22, 0xff -; VI-NEXT: v_readlane_b32 s25, v22, 56 -; VI-NEXT: s_or_b32 s22, s22, s24 -; VI-NEXT: s_lshl_b32 s24, s88, 8 -; VI-NEXT: s_and_b32 s25, s25, 0xff -; VI-NEXT: s_or_b32 s24, s25, s24 -; VI-NEXT: s_and_b32 s22, s22, 0xffff -; VI-NEXT: s_lshl_b32 s24, s24, 16 +; VI-NEXT: s_lshl_b32 s24, s80, 8 +; VI-NEXT: v_readlane_b32 s26, v22, 8 ; VI-NEXT: s_or_b32 s22, s22, s24 -; VI-NEXT: v_mov_b32_e32 v13, s22 -; VI-NEXT: s_and_b32 s22, s23, 0xff -; VI-NEXT: v_readlane_b32 s23, v22, 55 -; VI-NEXT: s_lshl_b32 s23, s23, 8 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: v_readlane_b32 s23, v22, 54 -; VI-NEXT: v_readlane_b32 s24, v22, 53 -; VI-NEXT: s_and_b32 s23, s23, 0xff -; VI-NEXT: s_lshl_b32 s24, s24, 8 -; VI-NEXT: s_or_b32 s23, s23, s24 -; VI-NEXT: s_and_b32 s22, s22, 0xffff -; VI-NEXT: s_lshl_b32 s23, s23, 16 -; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: v_mov_b32_e32 v14, s22 -; VI-NEXT: v_readlane_b32 s22, v22, 52 -; VI-NEXT: s_lshl_b32 s22, s22, 8 -; VI-NEXT: s_and_b32 s20, s20, 0xff -; VI-NEXT: v_readlane_b32 s23, v22, 51 -; VI-NEXT: s_or_b32 s20, s20, s22 -; VI-NEXT: s_lshl_b32 s22, s78, 8 -; VI-NEXT: s_and_b32 s23, s23, 0xff -; VI-NEXT: s_or_b32 s22, s23, s22 -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_lshl_b32 s22, s22, 16 -; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: s_and_b32 s24, s71, 0xff +; VI-NEXT: s_lshl_b32 s25, s26, 8 +; VI-NEXT: s_or_b32 s24, s24, s25 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: v_mov_b32_e32 v15, s20 -; VI-NEXT: s_and_b32 s20, s21, 0xff -; VI-NEXT: v_readlane_b32 s21, v22, 50 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_lshl_b32 s24, s24, 16 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_lshl_b32 s21, s21, 8 +; VI-NEXT: s_or_b32 s22, s22, s24 ; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: s_and_b32 s22, s23, 0xff +; VI-NEXT: s_lshl_b32 s23, s70, 8 ; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: v_readlane_b32 s21, v22, 49 -; VI-NEXT: v_readlane_b32 s22, v22, 48 +; VI-NEXT: s_or_b32 s22, s22, s23 ; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: s_and_b32 s21, s21, 0xff -; VI-NEXT: s_lshl_b32 s22, s22, 8 +; VI-NEXT: s_and_b32 s23, s73, 0xff +; VI-NEXT: s_lshl_b32 s24, s72, 8 ; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_or_b32 s23, s23, s24 ; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_lshl_b32 s21, s21, 16 +; VI-NEXT: s_and_b32 s22, s22, 0xffff +; VI-NEXT: s_lshl_b32 s23, s23, 16 ; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: s_or_b32 s22, s22, s23 ; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: v_mov_b32_e32 v2, s20 -; VI-NEXT: v_readlane_b32 s20, v22, 47 +; VI-NEXT: v_mov_b32_e32 v2, s22 +; VI-NEXT: s_and_b32 s20, s20, 0xff +; VI-NEXT: s_lshl_b32 s22, s69, 8 +; VI-NEXT: v_readlane_b32 s24, v22, 10 ; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: s_lshl_b32 s20, s20, 8 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: s_and_b32 s22, s68, 0xff +; VI-NEXT: s_lshl_b32 s23, s24, 8 ; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; VI-NEXT: s_or_b32 s18, s18, s20 -; VI-NEXT: v_readlane_b32 s20, v22, 46 +; VI-NEXT: s_or_b32 s22, s22, s23 ; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; VI-NEXT: s_and_b32 s20, s20, 0xff -; VI-NEXT: s_lshl_b32 s21, s76, 8 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_lshl_b32 s22, s22, 16 ; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s20, s20, s22 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s20 +; VI-NEXT: s_and_b32 s20, s21, 0xff +; VI-NEXT: s_lshl_b32 s21, s67, 8 ; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; VI-NEXT: s_and_b32 s21, s39, 0xff +; VI-NEXT: s_lshl_b32 s22, s38, 8 +; VI-NEXT: s_or_b32 s21, s21, s22 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_lshl_b32 s21, s21, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s20, s20, s21 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s20 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s20, s66, 8 +; VI-NEXT: v_readlane_b32 s22, v22, 12 +; VI-NEXT: s_or_b32 s18, s18, s20 +; VI-NEXT: s_and_b32 s20, s63, 0xff +; VI-NEXT: s_lshl_b32 s21, s22, 8 +; VI-NEXT: s_or_b32 s20, s20, s21 ; VI-NEXT: s_and_b32 s18, s18, 0xffff ; VI-NEXT: s_lshl_b32 s20, s20, 16 -; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 ; VI-NEXT: s_or_b32 s18, s18, s20 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: s_and_b32 s18, s19, 0xff -; VI-NEXT: v_readlane_b32 s19, v22, 45 -; VI-NEXT: s_lshl_b32 s19, s19, 8 +; VI-NEXT: s_lshl_b32 s19, s62, 8 +; VI-NEXT: v_readlane_b32 s20, v22, 36 ; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: v_readlane_b32 s19, v22, 44 -; VI-NEXT: v_readlane_b32 s20, v22, 43 -; VI-NEXT: s_and_b32 s19, s19, 0xff +; VI-NEXT: s_and_b32 s19, s36, 0xff ; VI-NEXT: s_lshl_b32 s20, s20, 8 ; VI-NEXT: s_or_b32 s19, s19, s20 ; VI-NEXT: s_and_b32 s18, s18, 0xffff @@ -83800,13 +83630,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_readlane_b32 s18, v22, 42 ; VI-NEXT: s_and_b32 s16, s16, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_lshl_b32 s18, s37, 8 +; VI-NEXT: v_readlane_b32 s20, v22, 14 ; VI-NEXT: s_or_b32 s16, s16, s18 -; VI-NEXT: v_readlane_b32 s18, v22, 41 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: s_lshl_b32 s19, s74, 8 +; VI-NEXT: s_and_b32 s18, s65, 0xff +; VI-NEXT: s_lshl_b32 s19, s20, 8 ; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: s_and_b32 s16, s16, 0xffff ; VI-NEXT: s_lshl_b32 s18, s18, 16 @@ -83815,13 +83644,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s16 ; VI-NEXT: s_and_b32 s16, s17, 0xff -; VI-NEXT: v_readlane_b32 s17, v22, 40 -; VI-NEXT: s_lshl_b32 s17, s17, 8 +; VI-NEXT: s_lshl_b32 s17, s64, 8 ; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v22, 39 -; VI-NEXT: v_readlane_b32 s18, v22, 38 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_and_b32 s17, s35, 0xff +; VI-NEXT: s_lshl_b32 s18, s34, 8 ; VI-NEXT: s_or_b32 s17, s17, s18 ; VI-NEXT: s_and_b32 s16, s16, 0xffff ; VI-NEXT: s_lshl_b32 s17, s17, 16 @@ -83829,13 +83655,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_readlane_b32 s16, v22, 37 ; VI-NEXT: s_and_b32 s14, s14, 0xff -; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_lshl_b32 s16, s60, 8 +; VI-NEXT: v_readlane_b32 s18, v22, 16 ; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: v_readlane_b32 s16, v22, 36 -; VI-NEXT: s_and_b32 s16, s16, 0xff -; VI-NEXT: s_lshl_b32 s17, s72, 8 +; VI-NEXT: s_and_b32 s16, s55, 0xff +; VI-NEXT: s_lshl_b32 s17, s18, 8 ; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: s_and_b32 s14, s14, 0xffff ; VI-NEXT: s_lshl_b32 s16, s16, 16 @@ -83844,13 +83669,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: s_and_b32 s14, s15, 0xff -; VI-NEXT: v_readlane_b32 s15, v22, 35 -; VI-NEXT: s_lshl_b32 s15, s15, 8 +; VI-NEXT: s_lshl_b32 s15, s54, 8 ; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: v_readlane_b32 s15, v22, 34 -; VI-NEXT: v_readlane_b32 s16, v22, 33 -; VI-NEXT: s_and_b32 s15, s15, 0xff -; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s15, s53, 0xff +; VI-NEXT: s_lshl_b32 s16, s31, 8 ; VI-NEXT: s_or_b32 s15, s15, s16 ; VI-NEXT: s_and_b32 s14, s14, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 @@ -83858,13 +83680,13 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_readlane_b32 s14, v22, 32 ; VI-NEXT: s_and_b32 s12, s12, 0xff -; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_lshl_b32 s14, s30, 8 ; VI-NEXT: s_or_b32 s12, s12, s14 -; VI-NEXT: v_readlane_b32 s14, v22, 31 +; VI-NEXT: v_readlane_b32 s14, v22, 35 +; VI-NEXT: v_readlane_b32 s16, v22, 18 ; VI-NEXT: s_and_b32 s14, s14, 0xff -; VI-NEXT: s_lshl_b32 s15, s62, 8 +; VI-NEXT: s_lshl_b32 s15, s16, 8 ; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: s_and_b32 s12, s12, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 @@ -83873,13 +83695,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s12 ; VI-NEXT: s_and_b32 s12, s13, 0xff -; VI-NEXT: v_readlane_b32 s13, v22, 30 -; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_lshl_b32 s13, s52, 8 ; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: v_readlane_b32 s13, v22, 29 -; VI-NEXT: v_readlane_b32 s14, v22, 28 -; VI-NEXT: s_and_b32 s13, s13, 0xff -; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_and_b32 s13, s59, 0xff +; VI-NEXT: s_lshl_b32 s14, s58, 8 ; VI-NEXT: s_or_b32 s13, s13, s14 ; VI-NEXT: s_and_b32 s12, s12, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 @@ -83887,13 +83706,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s12 -; VI-NEXT: v_readlane_b32 s12, v22, 27 ; VI-NEXT: s_and_b32 s10, s10, 0xff -; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_lshl_b32 s12, s51, 8 +; VI-NEXT: v_readlane_b32 s14, v22, 20 ; VI-NEXT: s_or_b32 s10, s10, s12 -; VI-NEXT: v_readlane_b32 s12, v22, 26 -; VI-NEXT: v_readlane_b32 s14, v22, 0 -; VI-NEXT: s_and_b32 s12, s12, 0xff +; VI-NEXT: s_and_b32 s12, s50, 0xff ; VI-NEXT: s_lshl_b32 s13, s14, 8 ; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: s_and_b32 s10, s10, 0xffff @@ -83903,13 +83720,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: s_and_b32 s10, s11, 0xff -; VI-NEXT: v_readlane_b32 s11, v22, 25 -; VI-NEXT: s_lshl_b32 s11, s11, 8 +; VI-NEXT: s_lshl_b32 s11, s87, 8 ; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: v_readlane_b32 s11, v22, 24 -; VI-NEXT: v_readlane_b32 s12, v22, 23 -; VI-NEXT: s_and_b32 s11, s11, 0xff -; VI-NEXT: s_lshl_b32 s12, s12, 8 +; VI-NEXT: s_and_b32 s11, s89, 0xff +; VI-NEXT: s_lshl_b32 s12, s88, 8 ; VI-NEXT: s_or_b32 s11, s11, s12 ; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 @@ -83917,13 +83731,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_readlane_b32 s10, v22, 22 ; VI-NEXT: s_and_b32 s8, s8, 0xff -; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_lshl_b32 s10, s29, 8 +; VI-NEXT: v_readlane_b32 s12, v22, 22 ; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: v_readlane_b32 s10, v22, 21 -; VI-NEXT: v_readlane_b32 s12, v22, 2 -; VI-NEXT: s_and_b32 s10, s10, 0xff +; VI-NEXT: s_and_b32 s10, s28, 0xff ; VI-NEXT: s_lshl_b32 s11, s12, 8 ; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: s_and_b32 s8, s8, 0xffff @@ -83933,13 +83745,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_and_b32 s8, s9, 0xff -; VI-NEXT: v_readlane_b32 s9, v22, 20 -; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_lshl_b32 s9, s86, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: v_readlane_b32 s9, v22, 19 -; VI-NEXT: v_readlane_b32 s10, v22, 18 -; VI-NEXT: s_and_b32 s9, s9, 0xff -; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_and_b32 s9, s85, 0xff +; VI-NEXT: s_lshl_b32 s10, s84, 8 ; VI-NEXT: s_or_b32 s9, s9, s10 ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 @@ -83947,13 +83756,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_readlane_b32 s8, v22, 17 ; VI-NEXT: s_and_b32 s6, s6, 0xff -; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_lshl_b32 s8, s79, 8 +; VI-NEXT: v_readlane_b32 s10, v22, 24 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_readlane_b32 s8, v22, 16 -; VI-NEXT: v_readlane_b32 s10, v22, 4 -; VI-NEXT: s_and_b32 s8, s8, 0xff +; VI-NEXT: s_and_b32 s8, s48, 0xff ; VI-NEXT: s_lshl_b32 s9, s10, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s6, s6, 0xffff @@ -83963,12 +83770,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_and_b32 s6, s7, 0xff -; VI-NEXT: v_readlane_b32 s7, v22, 15 -; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_lshl_b32 s7, s49, 8 +; VI-NEXT: v_readlane_b32 s8, v22, 34 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 14 -; VI-NEXT: v_readlane_b32 s8, v22, 13 -; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_and_b32 s7, s91, 0xff ; VI-NEXT: s_lshl_b32 s8, s8, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s6, s6, 0xffff @@ -83977,12 +83782,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_readlane_b32 s6, v22, 12 +; VI-NEXT: v_readlane_b32 s6, v22, 33 ; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_readlane_b32 s6, v22, 11 -; VI-NEXT: v_readlane_b32 s8, v22, 6 +; VI-NEXT: v_readlane_b32 s6, v22, 32 +; VI-NEXT: v_readlane_b32 s8, v22, 26 ; VI-NEXT: s_and_b32 s6, s6, 0xff ; VI-NEXT: s_lshl_b32 s7, s8, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 @@ -83993,11 +83798,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_and_b32 s4, s5, 0xff -; VI-NEXT: v_readlane_b32 s5, v22, 10 +; VI-NEXT: v_readlane_b32 s5, v22, 31 ; VI-NEXT: s_lshl_b32 s5, s5, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_readlane_b32 s5, v22, 9 -; VI-NEXT: v_readlane_b32 s6, v22, 8 +; VI-NEXT: v_readlane_b32 s5, v22, 30 +; VI-NEXT: v_readlane_b32 s6, v22, 29 ; VI-NEXT: s_and_b32 s5, s5, 0xff ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -84008,10 +83813,20 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_readlane_b32 s15, v22, 1 -; VI-NEXT: v_readlane_b32 s13, v22, 3 -; VI-NEXT: v_readlane_b32 s11, v22, 5 -; VI-NEXT: v_readlane_b32 s9, v22, 7 +; VI-NEXT: v_readlane_b32 s47, v22, 1 +; VI-NEXT: v_readlane_b32 s45, v22, 3 +; VI-NEXT: v_readlane_b32 s43, v22, 5 +; VI-NEXT: v_readlane_b32 s41, v22, 7 +; VI-NEXT: v_readlane_b32 s27, v22, 9 +; VI-NEXT: v_readlane_b32 s25, v22, 11 +; VI-NEXT: v_readlane_b32 s23, v22, 13 +; VI-NEXT: v_readlane_b32 s21, v22, 15 +; VI-NEXT: v_readlane_b32 s19, v22, 17 +; VI-NEXT: v_readlane_b32 s17, v22, 19 +; VI-NEXT: v_readlane_b32 s15, v22, 21 +; VI-NEXT: v_readlane_b32 s13, v22, 23 +; VI-NEXT: v_readlane_b32 s11, v22, 25 +; VI-NEXT: v_readlane_b32 s9, v22, 27 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_readlane_b32 s87, v21, 31 ; VI-NEXT: v_readlane_b32 s86, v21, 30 @@ -84051,164 +83866,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB57_4: -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr64 -; VI-NEXT: ; implicit-def: $sgpr55 -; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr51 -; VI-NEXT: ; implicit-def: $sgpr29 -; VI-NEXT: ; implicit-def: $sgpr28 -; VI-NEXT: ; implicit-def: $sgpr27 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr87 -; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr84 -; VI-NEXT: ; implicit-def: $sgpr83 -; VI-NEXT: ; implicit-def: $sgpr82 -; VI-NEXT: ; implicit-def: $sgpr81 -; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr71 -; VI-NEXT: ; implicit-def: $sgpr70 -; VI-NEXT: ; implicit-def: $sgpr69 -; VI-NEXT: ; implicit-def: $sgpr68 -; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr66 -; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr30 -; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; kill: killed $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v22, s60, 0 -; VI-NEXT: v_writelane_b32 v22, s61, 1 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v22, s60, 2 -; VI-NEXT: v_writelane_b32 v22, s61, 3 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v22, s60, 4 -; VI-NEXT: v_writelane_b32 v22, s61, 5 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v22, s60, 6 -; VI-NEXT: v_writelane_b32 v22, s61, 7 -; VI-NEXT: s_branch .LBB57_2 ; ; GFX9-LABEL: bitcast_v16i64_to_v128i8_scalar: ; GFX9: ; %bb.0: @@ -84305,152 +83962,154 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_cbranch_scc0 .LBB57_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s26, s5, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 2 -; GFX9-NEXT: s_lshr_b32 s26, s5, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 3 -; GFX9-NEXT: s_lshr_b32 s26, s5, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 4 -; GFX9-NEXT: s_lshr_b32 s26, s4, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 5 -; GFX9-NEXT: s_lshr_b32 s26, s4, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 6 -; GFX9-NEXT: s_lshr_b32 s26, s7, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 7 -; GFX9-NEXT: s_lshr_b32 s26, s7, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 8 -; GFX9-NEXT: s_lshr_b32 s26, s7, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 9 -; GFX9-NEXT: s_lshr_b32 s26, s6, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 10 -; GFX9-NEXT: s_lshr_b32 s26, s6, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 11 -; GFX9-NEXT: s_lshr_b32 s26, s9, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 12 -; GFX9-NEXT: s_lshr_b32 s26, s9, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 13 -; GFX9-NEXT: s_lshr_b32 s26, s9, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 14 -; GFX9-NEXT: s_lshr_b32 s26, s8, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 15 -; GFX9-NEXT: s_lshr_b32 s26, s8, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 16 -; GFX9-NEXT: s_lshr_b32 s26, s11, 24 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 17 -; GFX9-NEXT: s_lshr_b32 s26, s11, 16 +; GFX9-NEXT: s_lshr_b32 s26, s8, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 18 -; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 19 -; GFX9-NEXT: s_lshr_b32 s26, s10, 16 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 20 -; GFX9-NEXT: s_lshr_b32 s26, s10, 8 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 21 -; GFX9-NEXT: s_lshr_b32 s26, s13, 24 +; GFX9-NEXT: s_lshr_b32 s26, s10, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 22 -; GFX9-NEXT: s_lshr_b32 s26, s13, 16 +; GFX9-NEXT: s_lshr_b32 s26, s10, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 23 -; GFX9-NEXT: s_lshr_b32 s26, s13, 8 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 24 -; GFX9-NEXT: s_lshr_b32 s26, s12, 16 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 25 -; GFX9-NEXT: s_lshr_b32 s26, s12, 8 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 26 -; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: s_lshr_b32 s26, s12, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 27 -; GFX9-NEXT: s_lshr_b32 s26, s15, 16 +; GFX9-NEXT: s_lshr_b32 s26, s12, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 28 -; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 29 -; GFX9-NEXT: s_lshr_b32 s26, s14, 16 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 30 -; GFX9-NEXT: s_lshr_b32 s26, s14, 8 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 31 -; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: s_lshr_b32 s26, s14, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 32 -; GFX9-NEXT: s_lshr_b32 s26, s17, 16 +; GFX9-NEXT: s_lshr_b32 s26, s14, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 33 -; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 34 -; GFX9-NEXT: s_lshr_b32 s26, s16, 16 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 35 -; GFX9-NEXT: s_lshr_b32 s26, s16, 8 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 36 -; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 37 -; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: s_lshr_b32 s26, s16, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 38 -; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 39 -; GFX9-NEXT: s_lshr_b32 s26, s18, 16 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 40 -; GFX9-NEXT: s_lshr_b32 s26, s18, 8 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 41 -; GFX9-NEXT: s_lshr_b32 s26, s21, 24 +; GFX9-NEXT: s_lshr_b32 s26, s18, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 42 -; GFX9-NEXT: s_lshr_b32 s26, s21, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 43 -; GFX9-NEXT: s_lshr_b32 s26, s21, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 44 -; GFX9-NEXT: s_lshr_b32 s26, s20, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 45 -; GFX9-NEXT: s_lshr_b32 s26, s20, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 46 -; GFX9-NEXT: s_lshr_b32 s26, s23, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 47 -; GFX9-NEXT: s_lshr_b32 s26, s23, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 48 -; GFX9-NEXT: s_lshr_b32 s26, s23, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 49 -; GFX9-NEXT: s_lshr_b32 s26, s22, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 50 -; GFX9-NEXT: s_lshr_b64 s[28:29], s[4:5], 24 -; GFX9-NEXT: v_writelane_b32 v22, s28, 0 -; GFX9-NEXT: s_lshr_b32 s82, s22, 8 -; GFX9-NEXT: s_lshr_b32 s83, s25, 24 -; GFX9-NEXT: s_lshr_b32 s81, s25, 16 -; GFX9-NEXT: s_lshr_b32 s84, s25, 8 -; GFX9-NEXT: s_lshr_b32 s85, s24, 16 -; GFX9-NEXT: s_lshr_b32 s86, s24, 8 -; GFX9-NEXT: s_lshr_b32 s87, s41, 24 -; GFX9-NEXT: s_lshr_b32 s96, s41, 16 -; GFX9-NEXT: s_lshr_b32 s97, s41, 8 -; GFX9-NEXT: s_lshr_b32 s98, s40, 16 -; GFX9-NEXT: s_lshr_b32 s99, s40, 8 -; GFX9-NEXT: s_lshr_b32 s38, s43, 24 -; GFX9-NEXT: s_lshr_b32 s39, s43, 16 -; GFX9-NEXT: s_lshr_b32 s48, s43, 8 -; GFX9-NEXT: s_lshr_b32 s49, s42, 16 -; GFX9-NEXT: s_lshr_b32 s50, s42, 8 -; GFX9-NEXT: s_lshr_b32 s51, s45, 24 -; GFX9-NEXT: s_lshr_b32 s52, s45, 16 -; GFX9-NEXT: s_lshr_b32 s53, s45, 8 -; GFX9-NEXT: s_lshr_b32 s54, s44, 16 -; GFX9-NEXT: s_lshr_b32 s55, s44, 8 -; GFX9-NEXT: s_lshr_b32 s64, s47, 24 -; GFX9-NEXT: s_lshr_b32 s65, s47, 16 -; GFX9-NEXT: s_lshr_b32 s66, s47, 8 -; GFX9-NEXT: s_lshr_b32 s67, s46, 16 -; GFX9-NEXT: s_lshr_b32 s68, s46, 8 -; GFX9-NEXT: s_lshr_b32 s69, s57, 24 -; GFX9-NEXT: s_lshr_b32 s70, s57, 16 -; GFX9-NEXT: s_lshr_b32 s71, s57, 8 -; GFX9-NEXT: s_lshr_b32 s80, s56, 16 -; GFX9-NEXT: s_lshr_b32 s26, s56, 8 -; GFX9-NEXT: v_writelane_b32 v22, s29, 1 -; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 14 +; GFX9-NEXT: v_writelane_b32 v22, s79, 15 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 12 +; GFX9-NEXT: v_writelane_b32 v22, s79, 13 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 10 +; GFX9-NEXT: v_writelane_b32 v22, s79, 11 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[10:11], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 8 +; GFX9-NEXT: v_writelane_b32 v22, s79, 9 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 6 +; GFX9-NEXT: v_writelane_b32 v22, s79, 7 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 4 +; GFX9-NEXT: v_writelane_b32 v22, s79, 5 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[16:17], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 2 +; GFX9-NEXT: v_writelane_b32 v22, s79, 3 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[18:19], 24 +; GFX9-NEXT: v_writelane_b32 v22, s78, 0 +; GFX9-NEXT: s_lshr_b32 s88, s5, 16 +; GFX9-NEXT: v_writelane_b32 v22, s79, 1 ; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_lshr_b32 s90, s5, 8 +; GFX9-NEXT: s_mov_b32 s79, s88 ; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; GFX9-NEXT: s_lshr_b32 s92, s4, 16 +; GFX9-NEXT: s_mov_b32 s89, s90 ; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX9-NEXT: s_lshr_b32 s94, s4, 8 +; GFX9-NEXT: s_lshr_b32 s36, s7, 8 +; GFX9-NEXT: s_lshr_b32 s54, s6, 8 +; GFX9-NEXT: s_lshr_b32 s76, s9, 8 +; GFX9-NEXT: s_mov_b32 s91, s92 ; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; GFX9-NEXT: s_lshr_b32 vcc_lo, s7, 24 +; GFX9-NEXT: s_lshr_b32 vcc_hi, s7, 16 +; GFX9-NEXT: s_lshr_b32 s75, s9, 16 +; GFX9-NEXT: s_mov_b32 s93, s94 ; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 ; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 +; GFX9-NEXT: s_mov_b32 s35, s36 +; GFX9-NEXT: s_mov_b32 s36, s54 +; GFX9-NEXT: s_mov_b32 s54, s76 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[56:57], 24 +; GFX9-NEXT: s_lshr_b32 s37, s6, 16 +; GFX9-NEXT: s_lshr_b32 s55, s9, 24 +; GFX9-NEXT: s_lshr_b32 s53, s18, 16 +; GFX9-NEXT: s_lshr_b32 s64, s21, 24 +; GFX9-NEXT: s_lshr_b32 s65, s21, 16 +; GFX9-NEXT: s_lshr_b32 s66, s21, 8 +; GFX9-NEXT: s_lshr_b32 s67, s20, 16 +; GFX9-NEXT: s_lshr_b32 s68, s20, 8 +; GFX9-NEXT: s_lshr_b32 s69, s23, 24 +; GFX9-NEXT: s_lshr_b32 s70, s23, 16 +; GFX9-NEXT: s_lshr_b32 s71, s23, 8 +; GFX9-NEXT: s_lshr_b32 s80, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s25, 24 +; GFX9-NEXT: s_lshr_b32 s81, s25, 16 +; GFX9-NEXT: s_lshr_b32 s82, s25, 8 +; GFX9-NEXT: s_lshr_b32 s83, s24, 16 +; GFX9-NEXT: s_lshr_b32 s28, s24, 8 +; GFX9-NEXT: s_lshr_b32 s29, s41, 24 +; GFX9-NEXT: s_lshr_b32 s84, s41, 16 +; GFX9-NEXT: s_lshr_b32 s85, s41, 8 +; GFX9-NEXT: s_lshr_b32 s86, s40, 16 +; GFX9-NEXT: s_lshr_b32 s58, s40, 8 +; GFX9-NEXT: s_lshr_b32 s59, s43, 24 +; GFX9-NEXT: s_lshr_b32 s87, s43, 16 +; GFX9-NEXT: s_lshr_b32 s96, s43, 8 +; GFX9-NEXT: s_lshr_b32 s97, s42, 16 +; GFX9-NEXT: s_lshr_b32 s60, s42, 8 +; GFX9-NEXT: s_lshr_b32 s61, s45, 24 +; GFX9-NEXT: s_lshr_b32 s98, s45, 16 +; GFX9-NEXT: s_lshr_b32 s99, s45, 8 +; GFX9-NEXT: s_lshr_b32 s38, s44, 16 +; GFX9-NEXT: s_lshr_b32 s62, s44, 8 +; GFX9-NEXT: s_lshr_b32 s63, s47, 24 +; GFX9-NEXT: s_lshr_b32 s39, s47, 16 +; GFX9-NEXT: s_lshr_b32 s48, s47, 8 +; GFX9-NEXT: s_lshr_b32 s49, s46, 16 +; GFX9-NEXT: s_lshr_b32 s72, s46, 8 +; GFX9-NEXT: s_lshr_b32 s73, s57, 24 +; GFX9-NEXT: s_lshr_b32 s50, s57, 16 +; GFX9-NEXT: s_lshr_b32 s51, s57, 8 +; GFX9-NEXT: s_lshr_b32 s52, s56, 16 +; GFX9-NEXT: s_lshr_b32 s74, s56, 8 +; GFX9-NEXT: s_mov_b32 s95, vcc_lo +; GFX9-NEXT: s_mov_b32 s31, vcc_hi +; GFX9-NEXT: s_mov_b32 s77, s75 ; GFX9-NEXT: s_cbranch_execnz .LBB57_3 ; GFX9-NEXT: .LBB57_2: ; %cmp.true ; GFX9-NEXT: s_add_u32 s56, s56, 3 @@ -84485,297 +84144,265 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 ; GFX9-NEXT: s_add_u32 s4, s4, 3 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: s_lshr_b32 s26, s5, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 2 -; GFX9-NEXT: s_lshr_b32 s26, s5, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 3 -; GFX9-NEXT: s_lshr_b32 s26, s5, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 4 -; GFX9-NEXT: s_lshr_b32 s26, s4, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 5 -; GFX9-NEXT: s_lshr_b32 s26, s4, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 6 -; GFX9-NEXT: s_lshr_b32 s26, s7, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 7 -; GFX9-NEXT: s_lshr_b32 s26, s7, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 8 -; GFX9-NEXT: s_lshr_b32 s26, s7, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 9 -; GFX9-NEXT: s_lshr_b32 s26, s6, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 10 -; GFX9-NEXT: s_lshr_b32 s26, s6, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 11 -; GFX9-NEXT: s_lshr_b32 s26, s9, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 12 -; GFX9-NEXT: s_lshr_b32 s26, s9, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 13 -; GFX9-NEXT: s_lshr_b32 s26, s9, 8 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 14 -; GFX9-NEXT: s_lshr_b32 s26, s8, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 15 -; GFX9-NEXT: s_lshr_b32 s26, s8, 8 +; GFX9-NEXT: v_writelane_b32 v22, s27, 15 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 12 +; GFX9-NEXT: v_writelane_b32 v22, s27, 13 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 10 +; GFX9-NEXT: v_writelane_b32 v22, s27, 11 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 8 +; GFX9-NEXT: v_writelane_b32 v22, s27, 9 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 6 +; GFX9-NEXT: v_writelane_b32 v22, s27, 7 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 4 +; GFX9-NEXT: v_writelane_b32 v22, s27, 5 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 2 +; GFX9-NEXT: v_writelane_b32 v22, s27, 3 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[18:19], 24 +; GFX9-NEXT: v_writelane_b32 v22, s26, 0 +; GFX9-NEXT: v_writelane_b32 v22, s27, 1 +; GFX9-NEXT: s_lshr_b32 s26, s5, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 16 -; GFX9-NEXT: s_lshr_b32 s26, s11, 24 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 17 -; GFX9-NEXT: s_lshr_b32 s26, s11, 16 +; GFX9-NEXT: s_lshr_b32 s26, s8, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 18 -; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 19 -; GFX9-NEXT: s_lshr_b32 s26, s10, 16 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 20 -; GFX9-NEXT: s_lshr_b32 s26, s10, 8 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 21 -; GFX9-NEXT: s_lshr_b32 s26, s13, 24 +; GFX9-NEXT: s_lshr_b32 s26, s10, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 22 -; GFX9-NEXT: s_lshr_b32 s26, s13, 16 +; GFX9-NEXT: s_lshr_b32 s26, s10, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 23 -; GFX9-NEXT: s_lshr_b32 s26, s13, 8 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 24 -; GFX9-NEXT: s_lshr_b32 s26, s12, 16 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 25 -; GFX9-NEXT: s_lshr_b32 s26, s12, 8 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 26 -; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: s_lshr_b32 s26, s12, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 27 -; GFX9-NEXT: s_lshr_b32 s26, s15, 16 +; GFX9-NEXT: s_lshr_b32 s26, s12, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 28 -; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 29 -; GFX9-NEXT: s_lshr_b32 s26, s14, 16 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 30 -; GFX9-NEXT: s_lshr_b32 s26, s14, 8 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 31 -; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: s_lshr_b32 s26, s14, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 32 -; GFX9-NEXT: s_lshr_b32 s26, s17, 16 +; GFX9-NEXT: s_lshr_b32 s26, s14, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 33 -; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 34 -; GFX9-NEXT: s_lshr_b32 s26, s16, 16 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 35 -; GFX9-NEXT: s_lshr_b32 s26, s16, 8 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 36 -; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 37 -; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: s_lshr_b32 s26, s16, 8 ; GFX9-NEXT: v_writelane_b32 v22, s26, 38 -; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 39 -; GFX9-NEXT: s_lshr_b32 s26, s18, 16 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 40 -; GFX9-NEXT: s_lshr_b32 s26, s18, 8 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[56:57], 24 ; GFX9-NEXT: v_writelane_b32 v22, s26, 41 -; GFX9-NEXT: s_lshr_b32 s26, s21, 24 +; GFX9-NEXT: s_lshr_b32 s26, s18, 8 +; GFX9-NEXT: s_lshr_b32 s79, s5, 16 +; GFX9-NEXT: s_lshr_b32 s89, s5, 8 +; GFX9-NEXT: s_lshr_b32 s91, s4, 16 +; GFX9-NEXT: s_lshr_b32 s93, s4, 8 +; GFX9-NEXT: s_lshr_b32 s95, s7, 24 +; GFX9-NEXT: s_lshr_b32 s31, s7, 16 +; GFX9-NEXT: s_lshr_b32 s35, s7, 8 +; GFX9-NEXT: s_lshr_b32 s37, s6, 16 +; GFX9-NEXT: s_lshr_b32 s36, s6, 8 +; GFX9-NEXT: s_lshr_b32 s55, s9, 24 +; GFX9-NEXT: s_lshr_b32 s77, s9, 16 +; GFX9-NEXT: s_lshr_b32 s54, s9, 8 +; GFX9-NEXT: s_lshr_b32 s53, s18, 16 ; GFX9-NEXT: v_writelane_b32 v22, s26, 42 -; GFX9-NEXT: s_lshr_b32 s26, s21, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 43 -; GFX9-NEXT: s_lshr_b32 s26, s21, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 44 -; GFX9-NEXT: s_lshr_b32 s26, s20, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 45 -; GFX9-NEXT: s_lshr_b32 s26, s20, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 46 -; GFX9-NEXT: s_lshr_b32 s26, s23, 24 -; GFX9-NEXT: v_writelane_b32 v22, s26, 47 -; GFX9-NEXT: s_lshr_b32 s26, s23, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 48 -; GFX9-NEXT: s_lshr_b32 s26, s23, 8 -; GFX9-NEXT: v_writelane_b32 v22, s26, 49 -; GFX9-NEXT: s_lshr_b32 s26, s22, 16 -; GFX9-NEXT: v_writelane_b32 v22, s26, 50 -; GFX9-NEXT: s_lshr_b64 s[28:29], s[4:5], 24 -; GFX9-NEXT: v_writelane_b32 v22, s28, 0 -; GFX9-NEXT: s_lshr_b32 s82, s22, 8 -; GFX9-NEXT: s_lshr_b32 s83, s25, 24 +; GFX9-NEXT: s_lshr_b32 s64, s21, 24 +; GFX9-NEXT: s_lshr_b32 s65, s21, 16 +; GFX9-NEXT: s_lshr_b32 s66, s21, 8 +; GFX9-NEXT: s_lshr_b32 s67, s20, 16 +; GFX9-NEXT: s_lshr_b32 s68, s20, 8 +; GFX9-NEXT: s_lshr_b32 s69, s23, 24 +; GFX9-NEXT: s_lshr_b32 s70, s23, 16 +; GFX9-NEXT: s_lshr_b32 s71, s23, 8 +; GFX9-NEXT: s_lshr_b32 s80, s22, 16 +; GFX9-NEXT: s_lshr_b32 s26, s22, 8 +; GFX9-NEXT: s_lshr_b32 s27, s25, 24 ; GFX9-NEXT: s_lshr_b32 s81, s25, 16 -; GFX9-NEXT: s_lshr_b32 s84, s25, 8 -; GFX9-NEXT: s_lshr_b32 s85, s24, 16 -; GFX9-NEXT: s_lshr_b32 s86, s24, 8 -; GFX9-NEXT: s_lshr_b32 s87, s41, 24 -; GFX9-NEXT: s_lshr_b32 s96, s41, 16 -; GFX9-NEXT: s_lshr_b32 s97, s41, 8 -; GFX9-NEXT: s_lshr_b32 s98, s40, 16 -; GFX9-NEXT: s_lshr_b32 s99, s40, 8 -; GFX9-NEXT: s_lshr_b32 s38, s43, 24 -; GFX9-NEXT: s_lshr_b32 s39, s43, 16 -; GFX9-NEXT: s_lshr_b32 s48, s43, 8 -; GFX9-NEXT: s_lshr_b32 s49, s42, 16 -; GFX9-NEXT: s_lshr_b32 s50, s42, 8 -; GFX9-NEXT: s_lshr_b32 s51, s45, 24 -; GFX9-NEXT: s_lshr_b32 s52, s45, 16 -; GFX9-NEXT: s_lshr_b32 s53, s45, 8 -; GFX9-NEXT: s_lshr_b32 s54, s44, 16 -; GFX9-NEXT: s_lshr_b32 s55, s44, 8 -; GFX9-NEXT: s_lshr_b32 s64, s47, 24 -; GFX9-NEXT: s_lshr_b32 s65, s47, 16 -; GFX9-NEXT: s_lshr_b32 s66, s47, 8 -; GFX9-NEXT: s_lshr_b32 s67, s46, 16 -; GFX9-NEXT: s_lshr_b32 s68, s46, 8 -; GFX9-NEXT: s_lshr_b32 s69, s57, 24 -; GFX9-NEXT: s_lshr_b32 s70, s57, 16 -; GFX9-NEXT: s_lshr_b32 s71, s57, 8 -; GFX9-NEXT: s_lshr_b32 s80, s56, 16 -; GFX9-NEXT: s_lshr_b32 s26, s56, 8 -; GFX9-NEXT: v_writelane_b32 v22, s29, 1 -; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 +; GFX9-NEXT: s_lshr_b32 s82, s25, 8 +; GFX9-NEXT: s_lshr_b32 s83, s24, 16 +; GFX9-NEXT: s_lshr_b32 s28, s24, 8 +; GFX9-NEXT: s_lshr_b32 s29, s41, 24 +; GFX9-NEXT: s_lshr_b32 s84, s41, 16 +; GFX9-NEXT: s_lshr_b32 s85, s41, 8 +; GFX9-NEXT: s_lshr_b32 s86, s40, 16 +; GFX9-NEXT: s_lshr_b32 s58, s40, 8 +; GFX9-NEXT: s_lshr_b32 s59, s43, 24 +; GFX9-NEXT: s_lshr_b32 s87, s43, 16 +; GFX9-NEXT: s_lshr_b32 s96, s43, 8 +; GFX9-NEXT: s_lshr_b32 s97, s42, 16 +; GFX9-NEXT: s_lshr_b32 s60, s42, 8 +; GFX9-NEXT: s_lshr_b32 s61, s45, 24 +; GFX9-NEXT: s_lshr_b32 s98, s45, 16 +; GFX9-NEXT: s_lshr_b32 s99, s45, 8 +; GFX9-NEXT: s_lshr_b32 s38, s44, 16 +; GFX9-NEXT: s_lshr_b32 s62, s44, 8 +; GFX9-NEXT: s_lshr_b32 s63, s47, 24 +; GFX9-NEXT: s_lshr_b32 s39, s47, 16 +; GFX9-NEXT: s_lshr_b32 s48, s47, 8 +; GFX9-NEXT: s_lshr_b32 s49, s46, 16 +; GFX9-NEXT: s_lshr_b32 s72, s46, 8 +; GFX9-NEXT: s_lshr_b32 s73, s57, 24 +; GFX9-NEXT: s_lshr_b32 s50, s57, 16 +; GFX9-NEXT: s_lshr_b32 s51, s57, 8 +; GFX9-NEXT: s_lshr_b32 s52, s56, 16 +; GFX9-NEXT: s_lshr_b32 s74, s56, 8 ; GFX9-NEXT: .LBB57_3: ; %end -; GFX9-NEXT: s_lshl_b32 s26, s26, 8 -; GFX9-NEXT: s_and_b32 s27, s56, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 -; GFX9-NEXT: s_lshl_b32 s27, s36, 8 -; GFX9-NEXT: s_and_b32 s29, s80, 0xff -; GFX9-NEXT: s_or_b32 s27, s29, s27 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v1, s26 -; GFX9-NEXT: s_and_b32 s26, s57, 0xff -; GFX9-NEXT: s_lshl_b32 s27, s71, 8 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: s_and_b32 s27, s70, 0xff -; GFX9-NEXT: s_lshl_b32 s29, s69, 8 -; GFX9-NEXT: s_or_b32 s27, s27, s29 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-NEXT: s_lshl_b32 s26, s68, 8 -; GFX9-NEXT: s_and_b32 s27, s46, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 -; GFX9-NEXT: s_lshl_b32 s27, s34, 8 -; GFX9-NEXT: s_and_b32 s29, s67, 0xff -; GFX9-NEXT: s_or_b32 s27, s29, s27 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v3, s26 -; GFX9-NEXT: s_and_b32 s26, s47, 0xff -; GFX9-NEXT: s_lshl_b32 s27, s66, 8 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: s_and_b32 s27, s65, 0xff -; GFX9-NEXT: s_lshl_b32 s29, s64, 8 -; GFX9-NEXT: s_or_b32 s27, s27, s29 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v4, s26 -; GFX9-NEXT: s_lshl_b32 s26, s55, 8 -; GFX9-NEXT: s_and_b32 s27, s44, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 -; GFX9-NEXT: s_lshl_b32 s27, s30, 8 -; GFX9-NEXT: s_and_b32 s29, s54, 0xff -; GFX9-NEXT: s_or_b32 s27, s29, s27 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v5, s26 -; GFX9-NEXT: s_and_b32 s26, s45, 0xff -; GFX9-NEXT: s_lshl_b32 s27, s53, 8 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: s_and_b32 s27, s52, 0xff -; GFX9-NEXT: s_lshl_b32 s29, s51, 8 -; GFX9-NEXT: s_or_b32 s27, s27, s29 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v6, s26 -; GFX9-NEXT: s_lshl_b32 s26, s50, 8 -; GFX9-NEXT: s_and_b32 s27, s42, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 -; GFX9-NEXT: s_lshl_b32 s27, s94, 8 -; GFX9-NEXT: s_and_b32 s29, s49, 0xff -; GFX9-NEXT: s_or_b32 s27, s29, s27 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v7, s26 -; GFX9-NEXT: s_and_b32 s26, s43, 0xff -; GFX9-NEXT: s_lshl_b32 s27, s48, 8 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: s_and_b32 s27, s39, 0xff -; GFX9-NEXT: s_lshl_b32 s29, s38, 8 -; GFX9-NEXT: s_or_b32 s27, s27, s29 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v8, s26 -; GFX9-NEXT: s_lshl_b32 s26, s99, 8 -; GFX9-NEXT: s_and_b32 s27, s40, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 -; GFX9-NEXT: s_lshl_b32 s27, s92, 8 -; GFX9-NEXT: s_and_b32 s29, s98, 0xff -; GFX9-NEXT: s_or_b32 s27, s29, s27 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v9, s26 -; GFX9-NEXT: s_and_b32 s26, s41, 0xff -; GFX9-NEXT: s_lshl_b32 s27, s97, 8 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: s_and_b32 s27, s96, 0xff -; GFX9-NEXT: s_lshl_b32 s29, s87, 8 -; GFX9-NEXT: s_or_b32 s27, s27, s29 -; GFX9-NEXT: s_and_b32 s26, s26, 0xffff -; GFX9-NEXT: s_lshl_b32 s27, s27, 16 -; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_mov_b32_e32 v10, s26 -; GFX9-NEXT: s_lshl_b32 s26, s86, 8 +; GFX9-NEXT: s_and_b32 s56, s56, 0xff +; GFX9-NEXT: s_lshl_b32 s74, s74, 8 +; GFX9-NEXT: s_or_b32 s56, s56, s74 +; GFX9-NEXT: s_and_b32 s74, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s75, s76, 8 +; GFX9-NEXT: s_or_b32 s74, s74, s75 +; GFX9-NEXT: s_and_b32 s56, s56, 0xffff +; GFX9-NEXT: s_lshl_b32 s74, s74, 16 +; GFX9-NEXT: s_or_b32 s56, s56, s74 +; GFX9-NEXT: v_mov_b32_e32 v1, s56 +; GFX9-NEXT: s_and_b32 s56, s57, 0xff +; GFX9-NEXT: s_lshl_b32 s57, s51, 8 +; GFX9-NEXT: s_or_b32 s56, s56, s57 +; GFX9-NEXT: s_and_b32 s57, s50, 0xff +; GFX9-NEXT: s_lshl_b32 s73, s73, 8 +; GFX9-NEXT: s_or_b32 s57, s57, s73 +; GFX9-NEXT: s_and_b32 s56, s56, 0xffff +; GFX9-NEXT: s_lshl_b32 s57, s57, 16 +; GFX9-NEXT: s_or_b32 s56, s56, s57 +; GFX9-NEXT: v_mov_b32_e32 v2, s56 +; GFX9-NEXT: s_and_b32 s46, s46, 0xff +; GFX9-NEXT: s_lshl_b32 s56, s72, 8 +; GFX9-NEXT: s_or_b32 s46, s46, s56 +; GFX9-NEXT: s_and_b32 s56, s49, 0xff +; GFX9-NEXT: s_lshl_b32 s57, s34, 8 +; GFX9-NEXT: s_or_b32 s56, s56, s57 +; GFX9-NEXT: s_and_b32 s46, s46, 0xffff +; GFX9-NEXT: s_lshl_b32 s56, s56, 16 +; GFX9-NEXT: s_or_b32 s46, s46, s56 +; GFX9-NEXT: v_mov_b32_e32 v3, s46 +; GFX9-NEXT: s_and_b32 s46, s47, 0xff +; GFX9-NEXT: s_lshl_b32 s47, s48, 8 +; GFX9-NEXT: s_or_b32 s46, s46, s47 +; GFX9-NEXT: s_and_b32 s47, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s56, s63, 8 +; GFX9-NEXT: s_or_b32 s47, s47, s56 +; GFX9-NEXT: s_and_b32 s46, s46, 0xffff +; GFX9-NEXT: s_lshl_b32 s47, s47, 16 +; GFX9-NEXT: s_or_b32 s46, s46, s47 +; GFX9-NEXT: v_mov_b32_e32 v4, s46 +; GFX9-NEXT: s_and_b32 s44, s44, 0xff +; GFX9-NEXT: s_lshl_b32 s46, s62, 8 +; GFX9-NEXT: s_or_b32 s44, s44, s46 +; GFX9-NEXT: s_and_b32 s46, s38, 0xff +; GFX9-NEXT: s_lshl_b32 s47, s30, 8 +; GFX9-NEXT: s_or_b32 s46, s46, s47 +; GFX9-NEXT: s_and_b32 s44, s44, 0xffff +; GFX9-NEXT: s_lshl_b32 s46, s46, 16 +; GFX9-NEXT: s_or_b32 s44, s44, s46 +; GFX9-NEXT: v_mov_b32_e32 v5, s44 +; GFX9-NEXT: s_and_b32 s44, s45, 0xff +; GFX9-NEXT: s_lshl_b32 s45, s99, 8 +; GFX9-NEXT: s_or_b32 s44, s44, s45 +; GFX9-NEXT: s_and_b32 s45, s98, 0xff +; GFX9-NEXT: s_lshl_b32 s46, s61, 8 +; GFX9-NEXT: s_or_b32 s45, s45, s46 +; GFX9-NEXT: s_and_b32 s44, s44, 0xffff +; GFX9-NEXT: s_lshl_b32 s45, s45, 16 +; GFX9-NEXT: s_or_b32 s44, s44, s45 +; GFX9-NEXT: v_mov_b32_e32 v6, s44 +; GFX9-NEXT: s_and_b32 s42, s42, 0xff +; GFX9-NEXT: s_lshl_b32 s44, s60, 8 +; GFX9-NEXT: s_or_b32 s42, s42, s44 +; GFX9-NEXT: s_and_b32 s44, s97, 0xff +; GFX9-NEXT: s_lshl_b32 s45, s94, 8 +; GFX9-NEXT: s_or_b32 s44, s44, s45 +; GFX9-NEXT: s_and_b32 s42, s42, 0xffff +; GFX9-NEXT: s_lshl_b32 s44, s44, 16 +; GFX9-NEXT: s_or_b32 s42, s42, s44 +; GFX9-NEXT: v_mov_b32_e32 v7, s42 +; GFX9-NEXT: s_and_b32 s42, s43, 0xff +; GFX9-NEXT: s_lshl_b32 s43, s96, 8 +; GFX9-NEXT: s_or_b32 s42, s42, s43 +; GFX9-NEXT: s_and_b32 s43, s87, 0xff +; GFX9-NEXT: s_lshl_b32 s44, s59, 8 +; GFX9-NEXT: s_or_b32 s43, s43, s44 +; GFX9-NEXT: s_and_b32 s42, s42, 0xffff +; GFX9-NEXT: s_lshl_b32 s43, s43, 16 +; GFX9-NEXT: s_or_b32 s42, s42, s43 +; GFX9-NEXT: v_mov_b32_e32 v8, s42 +; GFX9-NEXT: s_and_b32 s40, s40, 0xff +; GFX9-NEXT: s_lshl_b32 s42, s58, 8 +; GFX9-NEXT: s_or_b32 s40, s40, s42 +; GFX9-NEXT: s_and_b32 s42, s86, 0xff +; GFX9-NEXT: s_lshl_b32 s43, s92, 8 +; GFX9-NEXT: s_or_b32 s42, s42, s43 +; GFX9-NEXT: s_and_b32 s40, s40, 0xffff +; GFX9-NEXT: s_lshl_b32 s42, s42, 16 +; GFX9-NEXT: s_or_b32 s40, s40, s42 +; GFX9-NEXT: v_mov_b32_e32 v9, s40 +; GFX9-NEXT: s_and_b32 s40, s41, 0xff +; GFX9-NEXT: s_lshl_b32 s41, s85, 8 +; GFX9-NEXT: s_or_b32 s40, s40, s41 +; GFX9-NEXT: s_and_b32 s41, s84, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s29, 8 +; GFX9-NEXT: s_or_b32 s29, s41, s29 +; GFX9-NEXT: s_and_b32 s40, s40, 0xffff +; GFX9-NEXT: s_lshl_b32 s29, s29, 16 +; GFX9-NEXT: s_or_b32 s29, s40, s29 ; GFX9-NEXT: s_and_b32 s24, s24, 0xff -; GFX9-NEXT: s_or_b32 s24, s24, s26 -; GFX9-NEXT: s_lshl_b32 s26, s90, 8 -; GFX9-NEXT: s_and_b32 s27, s85, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: s_lshl_b32 s28, s28, 8 +; GFX9-NEXT: v_mov_b32_e32 v10, s29 +; GFX9-NEXT: s_or_b32 s24, s24, s28 +; GFX9-NEXT: s_and_b32 s28, s83, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s90, 8 +; GFX9-NEXT: s_or_b32 s28, s28, s29 ; GFX9-NEXT: s_and_b32 s24, s24, 0xffff -; GFX9-NEXT: s_lshl_b32 s26, s26, 16 -; GFX9-NEXT: s_or_b32 s24, s24, s26 +; GFX9-NEXT: s_lshl_b32 s28, s28, 16 +; GFX9-NEXT: s_or_b32 s24, s24, s28 ; GFX9-NEXT: v_mov_b32_e32 v11, s24 ; GFX9-NEXT: s_and_b32 s24, s25, 0xff -; GFX9-NEXT: s_lshl_b32 s25, s84, 8 +; GFX9-NEXT: s_lshl_b32 s25, s82, 8 ; GFX9-NEXT: s_or_b32 s24, s24, s25 ; GFX9-NEXT: s_and_b32 s25, s81, 0xff -; GFX9-NEXT: s_lshl_b32 s26, s83, 8 -; GFX9-NEXT: s_or_b32 s25, s25, s26 +; GFX9-NEXT: s_lshl_b32 s27, s27, 8 +; GFX9-NEXT: s_or_b32 s25, s25, s27 ; GFX9-NEXT: s_and_b32 s24, s24, 0xffff ; GFX9-NEXT: s_lshl_b32 s25, s25, 16 ; GFX9-NEXT: s_or_b32 s24, s24, s25 -; GFX9-NEXT: v_mov_b32_e32 v12, s24 -; GFX9-NEXT: s_lshl_b32 s24, s82, 8 -; GFX9-NEXT: s_and_b32 s22, s22, 0xff -; GFX9-NEXT: v_readlane_b32 s25, v22, 50 -; GFX9-NEXT: s_or_b32 s22, s22, s24 -; GFX9-NEXT: s_lshl_b32 s24, s88, 8 -; GFX9-NEXT: s_and_b32 s25, s25, 0xff -; GFX9-NEXT: s_or_b32 s24, s25, s24 -; GFX9-NEXT: s_and_b32 s22, s22, 0xffff -; GFX9-NEXT: s_lshl_b32 s24, s24, 16 -; GFX9-NEXT: s_or_b32 s22, s22, s24 -; GFX9-NEXT: v_mov_b32_e32 v13, s22 -; GFX9-NEXT: s_and_b32 s22, s23, 0xff -; GFX9-NEXT: v_readlane_b32 s23, v22, 49 -; GFX9-NEXT: s_lshl_b32 s23, s23, 8 -; GFX9-NEXT: s_or_b32 s22, s22, s23 -; GFX9-NEXT: v_readlane_b32 s23, v22, 48 -; GFX9-NEXT: v_readlane_b32 s24, v22, 47 -; GFX9-NEXT: s_and_b32 s23, s23, 0xff -; GFX9-NEXT: s_lshl_b32 s24, s24, 8 -; GFX9-NEXT: s_or_b32 s23, s23, s24 -; GFX9-NEXT: s_and_b32 s22, s22, 0xffff -; GFX9-NEXT: s_lshl_b32 s23, s23, 16 -; GFX9-NEXT: s_or_b32 s22, s22, s23 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 @@ -84787,15 +84414,33 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: s_and_b32 s22, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s24, s26, 8 +; GFX9-NEXT: s_or_b32 s22, s22, s24 +; GFX9-NEXT: s_and_b32 s24, s80, 0xff +; GFX9-NEXT: s_lshl_b32 s25, s88, 8 +; GFX9-NEXT: s_or_b32 s24, s24, s25 +; GFX9-NEXT: s_and_b32 s22, s22, 0xffff +; GFX9-NEXT: s_lshl_b32 s24, s24, 16 +; GFX9-NEXT: s_or_b32 s22, s22, s24 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-NEXT: s_and_b32 s22, s23, 0xff +; GFX9-NEXT: s_lshl_b32 s23, s71, 8 +; GFX9-NEXT: s_or_b32 s22, s22, s23 +; GFX9-NEXT: s_and_b32 s23, s70, 0xff +; GFX9-NEXT: s_lshl_b32 s24, s69, 8 +; GFX9-NEXT: s_or_b32 s23, s23, s24 +; GFX9-NEXT: s_and_b32 s22, s22, 0xffff +; GFX9-NEXT: s_lshl_b32 s23, s23, 16 +; GFX9-NEXT: s_or_b32 s22, s22, s23 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 ; GFX9-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-NEXT: v_readlane_b32 s22, v22, 46 ; GFX9-NEXT: s_and_b32 s20, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s22, s22, 8 +; GFX9-NEXT: s_lshl_b32 s22, s68, 8 ; GFX9-NEXT: s_or_b32 s20, s20, s22 -; GFX9-NEXT: v_readlane_b32 s22, v22, 45 -; GFX9-NEXT: s_and_b32 s22, s22, 0xff +; GFX9-NEXT: s_and_b32 s22, s67, 0xff ; GFX9-NEXT: s_lshl_b32 s23, s78, 8 ; GFX9-NEXT: s_or_b32 s22, s22, s23 ; GFX9-NEXT: s_and_b32 s20, s20, 0xffff @@ -84804,26 +84449,23 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: v_mov_b32_e32 v1, s20 ; GFX9-NEXT: s_and_b32 s20, s21, 0xff -; GFX9-NEXT: v_readlane_b32 s21, v22, 44 -; GFX9-NEXT: s_lshl_b32 s21, s21, 8 +; GFX9-NEXT: s_lshl_b32 s21, s66, 8 ; GFX9-NEXT: s_or_b32 s20, s20, s21 -; GFX9-NEXT: v_readlane_b32 s21, v22, 43 -; GFX9-NEXT: v_readlane_b32 s22, v22, 42 -; GFX9-NEXT: s_and_b32 s21, s21, 0xff -; GFX9-NEXT: s_lshl_b32 s22, s22, 8 +; GFX9-NEXT: s_and_b32 s21, s65, 0xff +; GFX9-NEXT: s_lshl_b32 s22, s64, 8 ; GFX9-NEXT: s_or_b32 s21, s21, s22 ; GFX9-NEXT: s_and_b32 s20, s20, 0xffff ; GFX9-NEXT: s_lshl_b32 s21, s21, 16 ; GFX9-NEXT: s_or_b32 s20, s20, s21 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_mov_b32_e32 v1, s20 -; GFX9-NEXT: v_readlane_b32 s20, v22, 41 +; GFX9-NEXT: v_readlane_b32 s20, v22, 42 ; GFX9-NEXT: s_and_b32 s18, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s20, s20, 8 +; GFX9-NEXT: v_readlane_b32 s22, v22, 0 ; GFX9-NEXT: s_or_b32 s18, s18, s20 -; GFX9-NEXT: v_readlane_b32 s20, v22, 40 -; GFX9-NEXT: s_and_b32 s20, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s21, s76, 8 +; GFX9-NEXT: s_and_b32 s20, s53, 0xff +; GFX9-NEXT: s_lshl_b32 s21, s22, 8 ; GFX9-NEXT: s_or_b32 s20, s20, s21 ; GFX9-NEXT: s_and_b32 s18, s18, 0xffff ; GFX9-NEXT: s_lshl_b32 s20, s20, 16 @@ -84831,11 +84473,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: s_and_b32 s18, s19, 0xff -; GFX9-NEXT: v_readlane_b32 s19, v22, 39 +; GFX9-NEXT: v_readlane_b32 s19, v22, 41 ; GFX9-NEXT: s_lshl_b32 s19, s19, 8 ; GFX9-NEXT: s_or_b32 s18, s18, s19 -; GFX9-NEXT: v_readlane_b32 s19, v22, 38 -; GFX9-NEXT: v_readlane_b32 s20, v22, 37 +; GFX9-NEXT: v_readlane_b32 s19, v22, 40 +; GFX9-NEXT: v_readlane_b32 s20, v22, 39 ; GFX9-NEXT: s_and_b32 s19, s19, 0xff ; GFX9-NEXT: s_lshl_b32 s20, s20, 8 ; GFX9-NEXT: s_or_b32 s19, s19, s20 @@ -84844,13 +84486,14 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_readlane_b32 s18, v22, 36 +; GFX9-NEXT: v_readlane_b32 s18, v22, 38 ; GFX9-NEXT: s_and_b32 s16, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s18 -; GFX9-NEXT: v_readlane_b32 s18, v22, 35 +; GFX9-NEXT: v_readlane_b32 s18, v22, 37 +; GFX9-NEXT: v_readlane_b32 s20, v22, 2 ; GFX9-NEXT: s_and_b32 s18, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s19, s74, 8 +; GFX9-NEXT: s_lshl_b32 s19, s20, 8 ; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s18, s18, 16 @@ -84858,11 +84501,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s17, 0xff -; GFX9-NEXT: v_readlane_b32 s17, v22, 34 +; GFX9-NEXT: v_readlane_b32 s17, v22, 36 ; GFX9-NEXT: s_lshl_b32 s17, s17, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v22, 33 -; GFX9-NEXT: v_readlane_b32 s18, v22, 32 +; GFX9-NEXT: v_readlane_b32 s17, v22, 35 +; GFX9-NEXT: v_readlane_b32 s18, v22, 34 ; GFX9-NEXT: s_and_b32 s17, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s18, s18, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 @@ -84871,13 +84514,14 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_readlane_b32 s16, v22, 31 +; GFX9-NEXT: v_readlane_b32 s16, v22, 33 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff ; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s16 -; GFX9-NEXT: v_readlane_b32 s16, v22, 30 +; GFX9-NEXT: v_readlane_b32 s16, v22, 32 +; GFX9-NEXT: v_readlane_b32 s18, v22, 4 ; GFX9-NEXT: s_and_b32 s16, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s72, 8 +; GFX9-NEXT: s_lshl_b32 s17, s18, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: s_and_b32 s14, s14, 0xffff ; GFX9-NEXT: s_lshl_b32 s16, s16, 16 @@ -84885,11 +84529,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-NEXT: s_and_b32 s14, s15, 0xff -; GFX9-NEXT: v_readlane_b32 s15, v22, 29 +; GFX9-NEXT: v_readlane_b32 s15, v22, 31 ; GFX9-NEXT: s_lshl_b32 s15, s15, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 -; GFX9-NEXT: v_readlane_b32 s15, v22, 28 -; GFX9-NEXT: v_readlane_b32 s16, v22, 27 +; GFX9-NEXT: v_readlane_b32 s15, v22, 30 +; GFX9-NEXT: v_readlane_b32 s16, v22, 29 ; GFX9-NEXT: s_and_b32 s15, s15, 0xff ; GFX9-NEXT: s_lshl_b32 s16, s16, 8 ; GFX9-NEXT: s_or_b32 s15, s15, s16 @@ -84898,13 +84542,14 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_readlane_b32 s14, v22, 26 +; GFX9-NEXT: v_readlane_b32 s14, v22, 28 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff ; GFX9-NEXT: s_lshl_b32 s14, s14, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s14 -; GFX9-NEXT: v_readlane_b32 s14, v22, 25 +; GFX9-NEXT: v_readlane_b32 s14, v22, 27 +; GFX9-NEXT: v_readlane_b32 s16, v22, 6 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff -; GFX9-NEXT: s_lshl_b32 s15, s62, 8 +; GFX9-NEXT: s_lshl_b32 s15, s16, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: s_and_b32 s12, s12, 0xffff ; GFX9-NEXT: s_lshl_b32 s14, s14, 16 @@ -84912,11 +84557,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-NEXT: s_and_b32 s12, s13, 0xff -; GFX9-NEXT: v_readlane_b32 s13, v22, 24 +; GFX9-NEXT: v_readlane_b32 s13, v22, 26 ; GFX9-NEXT: s_lshl_b32 s13, s13, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 -; GFX9-NEXT: v_readlane_b32 s13, v22, 23 -; GFX9-NEXT: v_readlane_b32 s14, v22, 22 +; GFX9-NEXT: v_readlane_b32 s13, v22, 25 +; GFX9-NEXT: v_readlane_b32 s14, v22, 24 ; GFX9-NEXT: s_and_b32 s13, s13, 0xff ; GFX9-NEXT: s_lshl_b32 s14, s14, 8 ; GFX9-NEXT: s_or_b32 s13, s13, s14 @@ -84925,13 +84570,14 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_readlane_b32 s12, v22, 21 +; GFX9-NEXT: v_readlane_b32 s12, v22, 23 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s12 -; GFX9-NEXT: v_readlane_b32 s12, v22, 20 +; GFX9-NEXT: v_readlane_b32 s12, v22, 22 +; GFX9-NEXT: v_readlane_b32 s14, v22, 8 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff -; GFX9-NEXT: s_lshl_b32 s13, s60, 8 +; GFX9-NEXT: s_lshl_b32 s13, s14, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: s_and_b32 s10, s10, 0xffff ; GFX9-NEXT: s_lshl_b32 s12, s12, 16 @@ -84939,11 +84585,11 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff -; GFX9-NEXT: v_readlane_b32 s11, v22, 19 +; GFX9-NEXT: v_readlane_b32 s11, v22, 21 ; GFX9-NEXT: s_lshl_b32 s11, s11, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 -; GFX9-NEXT: v_readlane_b32 s11, v22, 18 -; GFX9-NEXT: v_readlane_b32 s12, v22, 17 +; GFX9-NEXT: v_readlane_b32 s11, v22, 20 +; GFX9-NEXT: v_readlane_b32 s12, v22, 19 ; GFX9-NEXT: s_and_b32 s11, s11, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s11, s11, s12 @@ -84952,13 +84598,14 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_readlane_b32 s10, v22, 16 +; GFX9-NEXT: v_readlane_b32 s10, v22, 18 ; GFX9-NEXT: s_and_b32 s8, s8, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: v_readlane_b32 s10, v22, 15 +; GFX9-NEXT: v_readlane_b32 s10, v22, 17 +; GFX9-NEXT: v_readlane_b32 s12, v22, 10 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s58, 8 +; GFX9-NEXT: s_lshl_b32 s11, s12, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 @@ -84966,26 +84613,22 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: s_and_b32 s8, s9, 0xff -; GFX9-NEXT: v_readlane_b32 s9, v22, 14 -; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: s_lshl_b32 s9, s54, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: v_readlane_b32 s9, v22, 13 -; GFX9-NEXT: v_readlane_b32 s10, v22, 12 -; GFX9-NEXT: s_and_b32 s9, s9, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_and_b32 s9, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s55, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_lshl_b32 s9, s9, 16 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_readlane_b32 s8, v22, 11 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_lshl_b32 s8, s36, 8 +; GFX9-NEXT: v_readlane_b32 s10, v22, 12 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_readlane_b32 s8, v22, 10 -; GFX9-NEXT: s_and_b32 s8, s8, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s28, 8 +; GFX9-NEXT: s_and_b32 s8, s37, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s10, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 @@ -84993,26 +84636,21 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s6, s7, 0xff -; GFX9-NEXT: v_readlane_b32 s7, v22, 9 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_lshl_b32 s7, s35, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_readlane_b32 s7, v22, 8 -; GFX9-NEXT: v_readlane_b32 s8, v22, 7 -; GFX9-NEXT: s_and_b32 s7, s7, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_and_b32 s7, s31, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s95, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_readlane_b32 s6, v22, 6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s6, s93, 8 +; GFX9-NEXT: v_readlane_b32 s8, v22, 14 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: v_readlane_b32 s6, v22, 5 -; GFX9-NEXT: v_readlane_b32 s8, v22, 0 -; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_and_b32 s6, s91, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s8, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -85021,12 +84659,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_and_b32 s4, s5, 0xff -; GFX9-NEXT: v_readlane_b32 s5, v22, 4 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_lshl_b32 s5, s89, 8 +; GFX9-NEXT: v_readlane_b32 s6, v22, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_readlane_b32 s5, v22, 3 -; GFX9-NEXT: v_readlane_b32 s6, v22, 2 -; GFX9-NEXT: s_and_b32 s5, s5, 0xff +; GFX9-NEXT: s_and_b32 s5, s79, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff @@ -85034,7 +84670,14 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_readlane_b32 s9, v22, 1 +; GFX9-NEXT: v_readlane_b32 s23, v22, 1 +; GFX9-NEXT: v_readlane_b32 s21, v22, 3 +; GFX9-NEXT: v_readlane_b32 s19, v22, 5 +; GFX9-NEXT: v_readlane_b32 s17, v22, 7 +; GFX9-NEXT: v_readlane_b32 s15, v22, 9 +; GFX9-NEXT: v_readlane_b32 s13, v22, 11 +; GFX9-NEXT: v_readlane_b32 s11, v22, 13 +; GFX9-NEXT: v_readlane_b32 s9, v22, 15 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: v_readlane_b32 s99, v21, 35 ; GFX9-NEXT: v_readlane_b32 s98, v21, 34 @@ -85079,153 +84722,145 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB57_4: -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: v_writelane_b32 v22, s82, 0 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr80 -; GFX9-NEXT: ; implicit-def: $sgpr71 -; GFX9-NEXT: ; implicit-def: $sgpr70 -; GFX9-NEXT: ; implicit-def: $sgpr69 -; GFX9-NEXT: ; implicit-def: $sgpr68 -; GFX9-NEXT: ; implicit-def: $sgpr67 -; GFX9-NEXT: ; implicit-def: $sgpr66 -; GFX9-NEXT: ; implicit-def: $sgpr65 -; GFX9-NEXT: ; implicit-def: $sgpr64 -; GFX9-NEXT: ; implicit-def: $sgpr55 -; GFX9-NEXT: ; implicit-def: $sgpr54 -; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 0 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 1 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 2 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 3 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 4 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 5 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 6 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 7 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 8 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 9 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 10 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 11 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 12 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 13 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: v_writelane_b32 v22, vcc_lo, 14 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: v_writelane_b32 v22, vcc_hi, 15 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr74 ; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr76 ; GFX9-NEXT: ; implicit-def: $sgpr51 ; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr72 ; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr48 ; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr99 ; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr94 ; GFX9-NEXT: ; implicit-def: $sgpr96 ; GFX9-NEXT: ; implicit-def: $sgpr87 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr84 -; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr29 +; GFX9-NEXT: ; implicit-def: $sgpr28 ; GFX9-NEXT: ; implicit-def: $sgpr83 -; GFX9-NEXT: ; implicit-def: $sgpr36 -; GFX9-NEXT: ; implicit-def: $sgpr34 -; GFX9-NEXT: ; implicit-def: $sgpr30 -; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr90 -; GFX9-NEXT: ; implicit-def: $sgpr88 -; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr76 -; GFX9-NEXT: ; implicit-def: $sgpr74 -; GFX9-NEXT: ; implicit-def: $sgpr72 -; GFX9-NEXT: ; implicit-def: $sgpr62 -; GFX9-NEXT: ; implicit-def: $sgpr60 -; GFX9-NEXT: ; implicit-def: $sgpr58 -; GFX9-NEXT: ; implicit-def: $sgpr28 -; GFX9-NEXT: v_writelane_b32 v22, s83, 1 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 ; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr81 ; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr27 -; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr55 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr31 +; GFX9-NEXT: ; implicit-def: $sgpr95 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr91 +; GFX9-NEXT: ; implicit-def: $sgpr89 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; kill: killed $sgpr75 ; GFX9-NEXT: s_branch .LBB57_2 ; ; GFX11-LABEL: bitcast_v16i64_to_v128i8_scalar: @@ -85260,18 +84895,18 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 ; GFX11-NEXT: v_writelane_b32 v34, s37, 5 ; GFX11-NEXT: v_writelane_b32 v35, s101, 5 -; GFX11-NEXT: v_readfirstlane_b32 s40, v16 -; GFX11-NEXT: v_readfirstlane_b32 s41, v17 -; GFX11-NEXT: v_readfirstlane_b32 s28, v18 +; GFX11-NEXT: v_readfirstlane_b32 s56, v16 +; GFX11-NEXT: v_readfirstlane_b32 s57, v17 +; GFX11-NEXT: v_readfirstlane_b32 s46, v18 ; GFX11-NEXT: v_writelane_b32 v34, s38, 6 ; GFX11-NEXT: v_writelane_b32 v35, s102, 6 -; GFX11-NEXT: v_readfirstlane_b32 s29, v19 -; GFX11-NEXT: v_readfirstlane_b32 s26, v20 -; GFX11-NEXT: v_readfirstlane_b32 s27, v21 +; GFX11-NEXT: v_readfirstlane_b32 s47, v19 +; GFX11-NEXT: v_readfirstlane_b32 s40, v20 +; GFX11-NEXT: v_readfirstlane_b32 s41, v21 ; GFX11-NEXT: v_writelane_b32 v34, s39, 7 ; GFX11-NEXT: v_writelane_b32 v35, s103, 7 -; GFX11-NEXT: v_readfirstlane_b32 s24, v22 -; GFX11-NEXT: v_readfirstlane_b32 s25, v23 +; GFX11-NEXT: v_readfirstlane_b32 s28, v22 +; GFX11-NEXT: v_readfirstlane_b32 s29, v23 ; GFX11-NEXT: v_readfirstlane_b32 s22, v24 ; GFX11-NEXT: v_writelane_b32 v34, s48, 8 ; GFX11-NEXT: v_readfirstlane_b32 s23, v25 @@ -85302,9 +84937,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readfirstlane_b32 s3, v12 ; GFX11-NEXT: v_readfirstlane_b32 s0, v13 ; GFX11-NEXT: v_readfirstlane_b32 s1, v14 -; GFX11-NEXT: s_mov_b32 s101, 0 +; GFX11-NEXT: s_and_b32 s24, vcc_lo, exec_lo ; GFX11-NEXT: v_writelane_b32 v34, s54, 14 -; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo ; GFX11-NEXT: v_writelane_b32 v35, s104, 8 ; GFX11-NEXT: ; implicit-def: $vgpr37 : SGPR spill to VGPR lane ; GFX11-NEXT: ; implicit-def: $vgpr36 : SGPR spill to VGPR lane @@ -85325,305 +84959,302 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_writelane_b32 v34, s85, 29 ; GFX11-NEXT: v_writelane_b32 v34, s86, 30 ; GFX11-NEXT: v_writelane_b32 v34, s87, 31 +; GFX11-NEXT: s_mov_b32 s87, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB57_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s43, s19, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v37, s43, 16 -; GFX11-NEXT: s_lshr_b32 s43, s18, 16 -; GFX11-NEXT: s_lshr_b32 s104, s1, 24 -; GFX11-NEXT: s_lshr_b32 s102, s1, 16 -; GFX11-NEXT: s_lshr_b32 s103, s1, 8 -; GFX11-NEXT: v_writelane_b32 v37, s43, 17 -; GFX11-NEXT: s_lshr_b32 s43, s18, 8 -; GFX11-NEXT: s_lshr_b32 s57, s0, 16 -; GFX11-NEXT: s_lshr_b32 s47, s0, 8 -; GFX11-NEXT: s_lshr_b32 s46, s3, 24 -; GFX11-NEXT: v_writelane_b32 v37, s43, 18 -; GFX11-NEXT: s_lshr_b32 s43, s21, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s3, 16 -; GFX11-NEXT: s_lshr_b32 s34, s3, 8 -; GFX11-NEXT: s_lshr_b32 s69, s2, 16 -; GFX11-NEXT: v_writelane_b32 v37, s43, 19 -; GFX11-NEXT: s_lshr_b32 s43, s21, 16 -; GFX11-NEXT: s_lshr_b32 s56, s2, 8 -; GFX11-NEXT: s_lshr_b32 s35, s5, 24 -; GFX11-NEXT: s_lshr_b32 s36, s5, 16 -; GFX11-NEXT: v_writelane_b32 v37, s43, 20 -; GFX11-NEXT: s_lshr_b32 s43, s21, 8 -; GFX11-NEXT: s_lshr_b32 s37, s5, 8 -; GFX11-NEXT: s_lshr_b32 s38, s4, 16 -; GFX11-NEXT: s_lshr_b32 s39, s4, 8 -; GFX11-NEXT: v_writelane_b32 v37, s43, 21 -; GFX11-NEXT: s_lshr_b32 s43, s20, 16 -; GFX11-NEXT: s_lshr_b32 s48, s7, 24 -; GFX11-NEXT: s_lshr_b32 s49, s7, 16 -; GFX11-NEXT: s_lshr_b32 s50, s7, 8 -; GFX11-NEXT: v_writelane_b32 v37, s43, 22 -; GFX11-NEXT: s_lshr_b32 s43, s20, 8 -; GFX11-NEXT: s_lshr_b32 s51, s6, 16 -; GFX11-NEXT: s_lshr_b32 s52, s6, 8 -; GFX11-NEXT: s_lshr_b32 s53, s9, 24 -; GFX11-NEXT: v_writelane_b32 v37, s43, 23 -; GFX11-NEXT: s_lshr_b32 s43, s23, 24 -; GFX11-NEXT: s_lshr_b32 s54, s9, 16 -; GFX11-NEXT: s_lshr_b32 s55, s9, 8 -; GFX11-NEXT: s_lshr_b32 s64, s8, 16 -; GFX11-NEXT: v_writelane_b32 v37, s43, 24 -; GFX11-NEXT: s_lshr_b32 s43, s23, 16 -; GFX11-NEXT: s_lshr_b32 s65, s8, 8 -; GFX11-NEXT: s_lshr_b32 s66, s11, 24 -; GFX11-NEXT: s_lshr_b32 s67, s11, 16 -; GFX11-NEXT: v_writelane_b32 v37, s43, 25 -; GFX11-NEXT: s_lshr_b32 s43, s23, 8 -; GFX11-NEXT: s_lshr_b32 s68, s11, 8 -; GFX11-NEXT: s_lshr_b32 s59, s10, 16 -; GFX11-NEXT: s_lshr_b32 s58, s10, 8 -; GFX11-NEXT: v_writelane_b32 v37, s43, 26 -; GFX11-NEXT: s_lshr_b32 s43, s22, 16 -; GFX11-NEXT: s_lshr_b32 s70, s13, 24 -; GFX11-NEXT: s_lshr_b32 s71, s13, 16 -; GFX11-NEXT: s_lshr_b32 s60, s13, 8 -; GFX11-NEXT: v_writelane_b32 v37, s43, 27 -; GFX11-NEXT: s_lshr_b32 s43, s22, 8 -; GFX11-NEXT: s_lshr_b32 s80, s12, 16 -; GFX11-NEXT: s_lshr_b32 s61, s12, 8 -; GFX11-NEXT: s_lshr_b32 s81, s15, 24 -; GFX11-NEXT: v_writelane_b32 v37, s43, 28 -; GFX11-NEXT: s_lshr_b32 s43, s25, 24 -; GFX11-NEXT: s_lshr_b32 s82, s15, 16 -; GFX11-NEXT: s_lshr_b32 s83, s15, 8 -; GFX11-NEXT: s_lshr_b32 s84, s14, 16 -; GFX11-NEXT: v_writelane_b32 v37, s43, 29 -; GFX11-NEXT: s_lshr_b32 s43, s25, 16 -; GFX11-NEXT: s_lshr_b32 s85, s14, 8 -; GFX11-NEXT: s_lshr_b32 s86, s17, 24 -; GFX11-NEXT: s_lshr_b32 s72, s17, 16 -; GFX11-NEXT: v_writelane_b32 v37, s43, 30 -; GFX11-NEXT: s_lshr_b32 s43, s25, 8 -; GFX11-NEXT: s_lshr_b32 s87, s17, 8 -; GFX11-NEXT: s_lshr_b32 s73, s16, 16 -; GFX11-NEXT: s_lshr_b32 s96, s16, 8 -; GFX11-NEXT: v_writelane_b32 v37, s43, 31 -; GFX11-NEXT: s_lshr_b32 s43, s24, 16 -; GFX11-NEXT: s_lshr_b32 s97, s19, 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 0 -; GFX11-NEXT: s_lshr_b32 s43, s24, 8 -; GFX11-NEXT: v_writelane_b32 v37, s62, 14 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s74, s28, 16 -; GFX11-NEXT: v_writelane_b32 v36, s43, 1 -; GFX11-NEXT: s_lshr_b32 s43, s27, 24 -; GFX11-NEXT: v_writelane_b32 v37, s63, 15 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[2:3], 24 -; GFX11-NEXT: s_lshr_b32 s98, s41, 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 2 -; GFX11-NEXT: s_lshr_b32 s43, s27, 16 -; GFX11-NEXT: v_writelane_b32 v37, s62, 12 -; GFX11-NEXT: s_lshr_b32 s99, s41, 16 -; GFX11-NEXT: s_lshr_b32 s100, s41, 8 -; GFX11-NEXT: v_writelane_b32 v36, s43, 3 -; GFX11-NEXT: s_lshr_b32 s43, s27, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 13 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; GFX11-NEXT: s_lshr_b32 s44, s40, 16 -; GFX11-NEXT: v_writelane_b32 v36, s43, 4 -; GFX11-NEXT: s_lshr_b32 s43, s26, 16 -; GFX11-NEXT: v_writelane_b32 v37, s62, 10 -; GFX11-NEXT: s_lshr_b32 s45, s40, 8 +; GFX11-NEXT: s_lshr_b32 s24, s1, 24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[0:1], 24 +; GFX11-NEXT: v_writelane_b32 v37, s24, 28 +; GFX11-NEXT: s_lshr_b32 s24, s1, 16 +; GFX11-NEXT: s_lshr_b32 s26, s22, 8 +; GFX11-NEXT: s_lshr_b32 vcc_lo, s2, 8 +; GFX11-NEXT: s_lshr_b32 s79, s9, 8 +; GFX11-NEXT: v_writelane_b32 v37, s24, 29 +; GFX11-NEXT: s_lshr_b32 s24, s1, 8 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[46:47], 24 +; GFX11-NEXT: s_lshr_b32 s31, s5, 24 +; GFX11-NEXT: s_lshr_b32 s97, s5, 16 +; GFX11-NEXT: v_writelane_b32 v37, s24, 30 +; GFX11-NEXT: s_lshr_b32 s24, s0, 16 +; GFX11-NEXT: s_lshr_b32 s61, s5, 8 +; GFX11-NEXT: s_lshr_b32 s99, s4, 16 +; GFX11-NEXT: s_lshr_b32 s101, s4, 8 +; GFX11-NEXT: v_writelane_b32 v37, s24, 31 +; GFX11-NEXT: s_lshr_b32 s24, s0, 8 +; GFX11-NEXT: s_lshr_b32 s63, s7, 24 +; GFX11-NEXT: v_writelane_b32 v36, s24, 0 +; GFX11-NEXT: s_lshr_b32 s24, s3, 24 +; GFX11-NEXT: v_writelane_b32 v37, s76, 26 +; GFX11-NEXT: s_lshr_b32 s103, s7, 16 +; GFX11-NEXT: s_lshr_b32 s73, s7, 8 +; GFX11-NEXT: v_writelane_b32 v36, s24, 1 +; GFX11-NEXT: s_lshr_b32 s24, s3, 16 +; GFX11-NEXT: v_writelane_b32 v37, s77, 27 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[2:3], 24 +; GFX11-NEXT: s_lshr_b32 s35, s6, 16 +; GFX11-NEXT: v_writelane_b32 v36, s24, 2 +; GFX11-NEXT: s_lshr_b32 s24, s3, 8 +; GFX11-NEXT: v_writelane_b32 v37, s76, 24 +; GFX11-NEXT: s_lshr_b32 s75, s6, 8 +; GFX11-NEXT: s_lshr_b32 s37, s9, 24 +; GFX11-NEXT: v_writelane_b32 v36, s24, 3 +; GFX11-NEXT: s_lshr_b32 s24, s2, 16 +; GFX11-NEXT: v_writelane_b32 v37, s77, 25 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[4:5], 24 +; GFX11-NEXT: s_lshr_b32 s39, s9, 16 +; GFX11-NEXT: v_writelane_b32 v36, s24, 4 +; GFX11-NEXT: s_lshr_b32 s38, s8, 16 +; GFX11-NEXT: v_writelane_b32 v37, s76, 22 +; GFX11-NEXT: s_lshr_b32 s96, s8, 8 +; GFX11-NEXT: s_lshr_b32 s98, s11, 24 +; GFX11-NEXT: v_writelane_b32 v36, s26, 5 +; GFX11-NEXT: s_lshr_b32 s26, s29, 24 +; GFX11-NEXT: v_writelane_b32 v37, s77, 23 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-NEXT: s_lshr_b32 s30, s11, 16 +; GFX11-NEXT: v_writelane_b32 v36, s26, 6 +; GFX11-NEXT: s_lshr_b32 s26, s29, 16 +; GFX11-NEXT: v_writelane_b32 v37, s76, 20 +; GFX11-NEXT: s_lshr_b32 s62, s11, 8 +; GFX11-NEXT: s_lshr_b32 s100, s10, 16 +; GFX11-NEXT: s_lshr_b32 s104, s10, 8 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s13, 24 +; GFX11-NEXT: v_writelane_b32 v37, s77, 21 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[8:9], 24 +; GFX11-NEXT: s_lshr_b32 s72, s13, 16 +; GFX11-NEXT: s_lshr_b32 s102, s13, 8 +; GFX11-NEXT: s_lshr_b32 s74, s12, 16 +; GFX11-NEXT: v_writelane_b32 v37, s76, 18 +; GFX11-NEXT: s_lshr_b32 s34, s12, 8 +; GFX11-NEXT: s_lshr_b32 s36, s15, 24 +; GFX11-NEXT: s_lshr_b32 s60, s15, 16 +; GFX11-NEXT: s_lshr_b32 s48, s15, 8 +; GFX11-NEXT: v_writelane_b32 v37, s77, 19 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[10:11], 24 +; GFX11-NEXT: s_lshr_b32 s49, s14, 16 +; GFX11-NEXT: s_lshr_b32 s50, s14, 8 +; GFX11-NEXT: s_lshr_b32 s78, s17, 24 +; GFX11-NEXT: v_writelane_b32 v37, s76, 16 +; GFX11-NEXT: s_lshr_b32 s51, s17, 16 +; GFX11-NEXT: s_lshr_b32 s52, s17, 8 +; GFX11-NEXT: s_lshr_b32 s53, s16, 16 +; GFX11-NEXT: s_lshr_b32 s88, s16, 8 +; GFX11-NEXT: v_writelane_b32 v37, s77, 17 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[12:13], 24 +; GFX11-NEXT: s_lshr_b32 s89, s19, 24 +; GFX11-NEXT: s_lshr_b32 s54, s19, 16 +; GFX11-NEXT: s_lshr_b32 s55, s19, 8 +; GFX11-NEXT: v_writelane_b32 v37, s76, 14 +; GFX11-NEXT: s_lshr_b32 s64, s18, 16 +; GFX11-NEXT: s_lshr_b32 s65, s18, 8 +; GFX11-NEXT: s_lshr_b32 s90, s21, 24 +; GFX11-NEXT: s_lshr_b32 s91, s21, 16 +; GFX11-NEXT: v_writelane_b32 v37, s77, 15 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[14:15], 24 +; GFX11-NEXT: s_lshr_b32 s66, s21, 8 +; GFX11-NEXT: s_lshr_b32 s67, s20, 16 +; GFX11-NEXT: s_lshr_b32 s68, s20, 8 +; GFX11-NEXT: v_writelane_b32 v37, s76, 12 +; GFX11-NEXT: s_lshr_b32 s92, s23, 24 +; GFX11-NEXT: s_lshr_b32 s93, s23, 16 +; GFX11-NEXT: s_lshr_b32 s24, s23, 8 +; GFX11-NEXT: s_lshr_b32 s25, s22, 16 +; GFX11-NEXT: v_writelane_b32 v37, s77, 13 ; GFX11-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 5 -; GFX11-NEXT: s_lshr_b32 s43, s26, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 11 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 6 -; GFX11-NEXT: s_lshr_b32 s43, s29, 24 -; GFX11-NEXT: v_writelane_b32 v37, s62, 8 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[22:23], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 7 -; GFX11-NEXT: s_lshr_b32 s43, s29, 16 -; GFX11-NEXT: v_writelane_b32 v37, s63, 9 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[26:27], 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 8 -; GFX11-NEXT: s_lshr_b32 s43, s29, 8 -; GFX11-NEXT: v_writelane_b32 v37, s62, 6 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[28:29], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v36, s43, 9 -; GFX11-NEXT: s_lshr_b32 s43, s28, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 7 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-NEXT: v_writelane_b32 v36, s26, 7 +; GFX11-NEXT: s_lshr_b32 s26, s29, 8 +; GFX11-NEXT: s_lshr_b32 s27, s28, 16 +; GFX11-NEXT: v_writelane_b32 v37, s76, 10 +; GFX11-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-NEXT: s_lshr_b32 s43, s41, 24 +; GFX11-NEXT: s_lshr_b32 s44, s41, 16 +; GFX11-NEXT: s_lshr_b32 s45, s41, 8 +; GFX11-NEXT: v_writelane_b32 v37, s77, 11 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX11-NEXT: s_lshr_b32 s69, s40, 16 +; GFX11-NEXT: s_lshr_b32 s70, s40, 8 +; GFX11-NEXT: s_lshr_b32 s71, s47, 24 +; GFX11-NEXT: v_writelane_b32 v37, s76, 8 +; GFX11-NEXT: s_lshr_b32 s80, s47, 16 +; GFX11-NEXT: s_lshr_b32 s81, s47, 8 +; GFX11-NEXT: s_lshr_b32 s82, s46, 16 +; GFX11-NEXT: s_lshr_b32 s83, s46, 8 +; GFX11-NEXT: v_writelane_b32 v37, s77, 9 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[20:21], 24 +; GFX11-NEXT: s_lshr_b32 s84, s57, 24 +; GFX11-NEXT: s_lshr_b32 s85, s57, 16 +; GFX11-NEXT: s_lshr_b32 s86, s57, 8 +; GFX11-NEXT: v_writelane_b32 v37, s76, 6 +; GFX11-NEXT: s_lshr_b32 s58, s56, 16 +; GFX11-NEXT: s_lshr_b32 s59, s56, 8 +; GFX11-NEXT: s_mov_b32 s95, vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, s77, 7 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[22:23], 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v37, s62, 4 -; GFX11-NEXT: v_writelane_b32 v37, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX11-NEXT: v_writelane_b32 v37, s62, 2 -; GFX11-NEXT: v_writelane_b32 v37, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 +; GFX11-NEXT: v_writelane_b32 v37, s76, 4 +; GFX11-NEXT: v_writelane_b32 v37, s77, 5 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[28:29], 24 +; GFX11-NEXT: v_writelane_b32 v37, s76, 2 +; GFX11-NEXT: v_writelane_b32 v37, s77, 3 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[40:41], 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v37, s62, 0 -; GFX11-NEXT: v_writelane_b32 v37, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 +; GFX11-NEXT: v_writelane_b32 v37, s76, 0 +; GFX11-NEXT: v_writelane_b32 v37, s77, 1 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[56:57], 24 +; GFX11-NEXT: s_mov_b32 s77, s79 ; GFX11-NEXT: s_branch .LBB57_3 ; GFX11-NEXT: .LBB57_2: +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: v_writelane_b32 v37, s24, 0 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: ; implicit-def: $sgpr62 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 ; GFX11-NEXT: ; implicit-def: $vcc_hi -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: s_mov_b32 s101, -1 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr45 -; GFX11-NEXT: ; implicit-def: $sgpr44 -; GFX11-NEXT: ; implicit-def: $sgpr30 -; GFX11-NEXT: ; implicit-def: $sgpr100 -; GFX11-NEXT: ; implicit-def: $sgpr99 -; GFX11-NEXT: ; implicit-def: $sgpr98 -; GFX11-NEXT: ; implicit-def: $sgpr43 -; GFX11-NEXT: ; implicit-def: $sgpr74 -; GFX11-NEXT: ; implicit-def: $sgpr94 -; GFX11-NEXT: ; implicit-def: $sgpr92 -; GFX11-NEXT: ; implicit-def: $sgpr90 -; GFX11-NEXT: ; implicit-def: $sgpr78 -; GFX11-NEXT: ; implicit-def: $sgpr62 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr97 -; GFX11-NEXT: ; implicit-def: $sgpr96 -; GFX11-NEXT: ; implicit-def: $sgpr73 -; GFX11-NEXT: ; implicit-def: $sgpr87 -; GFX11-NEXT: ; implicit-def: $sgpr72 +; GFX11-NEXT: ; implicit-def: $sgpr79 +; GFX11-NEXT: s_mov_b32 s87, -1 +; GFX11-NEXT: ; kill: killed $sgpr79 +; GFX11-NEXT: ; implicit-def: $sgpr79 +; GFX11-NEXT: ; kill: killed $sgpr79 +; GFX11-NEXT: ; implicit-def: $sgpr79 +; GFX11-NEXT: ; implicit-def: $sgpr59 +; GFX11-NEXT: ; implicit-def: $sgpr58 +; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: ; implicit-def: $sgpr86 ; GFX11-NEXT: ; implicit-def: $sgpr85 ; GFX11-NEXT: ; implicit-def: $sgpr84 ; GFX11-NEXT: ; implicit-def: $sgpr83 ; GFX11-NEXT: ; implicit-def: $sgpr82 +; GFX11-NEXT: ; implicit-def: $sgpr94 ; GFX11-NEXT: ; implicit-def: $sgpr81 -; GFX11-NEXT: ; implicit-def: $sgpr61 ; GFX11-NEXT: ; implicit-def: $sgpr80 -; GFX11-NEXT: ; implicit-def: $sgpr60 ; GFX11-NEXT: ; implicit-def: $sgpr71 ; GFX11-NEXT: ; implicit-def: $sgpr70 -; GFX11-NEXT: ; implicit-def: $sgpr58 -; GFX11-NEXT: ; implicit-def: $sgpr59 +; GFX11-NEXT: ; implicit-def: $sgpr69 +; GFX11-NEXT: ; implicit-def: $sgpr45 +; GFX11-NEXT: ; implicit-def: $sgpr44 +; GFX11-NEXT: ; implicit-def: $sgpr43 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr27 +; GFX11-NEXT: ; implicit-def: $sgpr26 +; GFX11-NEXT: ; implicit-def: $sgpr93 +; GFX11-NEXT: ; implicit-def: $sgpr92 ; GFX11-NEXT: ; implicit-def: $sgpr68 ; GFX11-NEXT: ; implicit-def: $sgpr67 ; GFX11-NEXT: ; implicit-def: $sgpr66 +; GFX11-NEXT: ; implicit-def: $sgpr91 +; GFX11-NEXT: ; implicit-def: $sgpr90 ; GFX11-NEXT: ; implicit-def: $sgpr65 ; GFX11-NEXT: ; implicit-def: $sgpr64 ; GFX11-NEXT: ; implicit-def: $sgpr55 ; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr89 +; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr53 ; GFX11-NEXT: ; implicit-def: $sgpr52 ; GFX11-NEXT: ; implicit-def: $sgpr51 +; GFX11-NEXT: ; implicit-def: $sgpr78 ; GFX11-NEXT: ; implicit-def: $sgpr50 ; GFX11-NEXT: ; implicit-def: $sgpr49 ; GFX11-NEXT: ; implicit-def: $sgpr48 -; GFX11-NEXT: ; implicit-def: $sgpr39 -; GFX11-NEXT: ; implicit-def: $sgpr38 -; GFX11-NEXT: ; implicit-def: $sgpr37 ; GFX11-NEXT: ; implicit-def: $sgpr36 -; GFX11-NEXT: ; implicit-def: $sgpr35 -; GFX11-NEXT: ; implicit-def: $sgpr56 -; GFX11-NEXT: ; implicit-def: $sgpr69 ; GFX11-NEXT: ; implicit-def: $sgpr34 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr47 -; GFX11-NEXT: ; implicit-def: $sgpr57 -; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $sgpr102 +; GFX11-NEXT: ; implicit-def: $sgpr72 ; GFX11-NEXT: ; implicit-def: $sgpr104 -; GFX11-NEXT: ; implicit-def: $sgpr88 -; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6 +; GFX11-NEXT: ; implicit-def: $sgpr100 +; GFX11-NEXT: ; implicit-def: $sgpr30 +; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr96 +; GFX11-NEXT: ; implicit-def: $sgpr38 +; GFX11-NEXT: ; implicit-def: $sgpr77 +; GFX11-NEXT: ; implicit-def: $sgpr39 +; GFX11-NEXT: ; implicit-def: $sgpr37 +; GFX11-NEXT: ; implicit-def: $sgpr75 +; GFX11-NEXT: ; implicit-def: $sgpr35 +; GFX11-NEXT: ; implicit-def: $sgpr73 +; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr101 +; GFX11-NEXT: ; implicit-def: $sgpr99 +; GFX11-NEXT: ; implicit-def: $sgpr97 +; GFX11-NEXT: ; implicit-def: $sgpr31 +; GFX11-NEXT: ; implicit-def: $sgpr95 +; GFX11-NEXT: ; kill: killed $sgpr79 +; GFX11-NEXT: ; implicit-def: $sgpr79 +; GFX11-NEXT: ; kill: killed $sgpr79 +; GFX11-NEXT: v_writelane_b32 v37, s25, 1 +; GFX11-NEXT: v_writelane_b32 v37, s24, 2 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: ; kill: killed $sgpr24 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: ; kill: killed $sgpr24 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: ; kill: killed $sgpr24 +; GFX11-NEXT: ; implicit-def: $sgpr24 +; GFX11-NEXT: v_writelane_b32 v37, s25, 3 +; GFX11-NEXT: ; implicit-def: $sgpr25 +; GFX11-NEXT: v_writelane_b32 v37, s60, 4 +; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: v_writelane_b32 v37, s61, 5 +; GFX11-NEXT: v_writelane_b32 v37, s60, 6 +; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: v_writelane_b32 v37, s61, 7 +; GFX11-NEXT: v_writelane_b32 v37, s60, 8 +; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: v_writelane_b32 v37, s61, 9 +; GFX11-NEXT: v_writelane_b32 v37, s60, 10 +; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: v_writelane_b32 v37, s61, 11 +; GFX11-NEXT: v_writelane_b32 v37, s60, 12 +; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: v_writelane_b32 v37, s61, 13 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: v_writelane_b32 v37, s62, 14 +; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: v_writelane_b32 v37, s63, 15 +; GFX11-NEXT: v_writelane_b32 v37, s62, 16 +; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: v_writelane_b32 v37, s63, 17 +; GFX11-NEXT: ; implicit-def: $sgpr63 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 18 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 19 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 20 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 21 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 22 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 23 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 24 ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 14 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 15 +; GFX11-NEXT: ; kill: killed $vcc_lo +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; kill: killed $vcc_lo +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 25 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 26 +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; kill: killed $vcc_lo +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; kill: killed $vcc_lo +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; kill: killed $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 27 ; GFX11-NEXT: .LBB57_3: ; %Flow -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s101 -; GFX11-NEXT: s_mov_b32 s101, s104 -; GFX11-NEXT: s_mov_b32 s104, s57 -; GFX11-NEXT: s_mov_b32 s57, s69 -; GFX11-NEXT: s_mov_b32 s69, s42 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s87 +; GFX11-NEXT: s_mov_b32 s79, s24 +; GFX11-NEXT: s_mov_b32 s87, s25 ; GFX11-NEXT: s_cbranch_vccnz .LBB57_5 ; GFX11-NEXT: ; %bb.4: ; %cmp.true +; GFX11-NEXT: s_add_u32 s56, s56, 3 +; GFX11-NEXT: s_addc_u32 s57, s57, 0 +; GFX11-NEXT: s_add_u32 s46, s46, 3 +; GFX11-NEXT: s_addc_u32 s47, s47, 0 ; GFX11-NEXT: s_add_u32 s40, s40, 3 ; GFX11-NEXT: s_addc_u32 s41, s41, 0 ; GFX11-NEXT: s_add_u32 s28, s28, 3 ; GFX11-NEXT: s_addc_u32 s29, s29, 0 -; GFX11-NEXT: s_add_u32 s26, s26, 3 -; GFX11-NEXT: s_addc_u32 s27, s27, 0 -; GFX11-NEXT: s_add_u32 s24, s24, 3 -; GFX11-NEXT: s_addc_u32 s25, s25, 0 ; GFX11-NEXT: s_add_u32 s22, s22, 3 ; GFX11-NEXT: s_addc_u32 s23, s23, 0 ; GFX11-NEXT: s_add_u32 s20, s20, 3 @@ -85648,348 +85279,316 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 ; GFX11-NEXT: s_add_u32 s0, s0, 3 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 -; GFX11-NEXT: s_lshr_b32 s42, s19, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v37, s42, 16 -; GFX11-NEXT: s_lshr_b32 s42, s18, 16 -; GFX11-NEXT: s_lshr_b32 s101, s1, 24 -; GFX11-NEXT: s_lshr_b32 s102, s1, 16 -; GFX11-NEXT: s_lshr_b32 s103, s1, 8 -; GFX11-NEXT: v_writelane_b32 v37, s42, 17 -; GFX11-NEXT: s_lshr_b32 s42, s18, 8 -; GFX11-NEXT: s_lshr_b32 s104, s0, 16 -; GFX11-NEXT: s_lshr_b32 s47, s0, 8 -; GFX11-NEXT: s_lshr_b32 s46, s3, 24 -; GFX11-NEXT: v_writelane_b32 v37, s42, 18 -; GFX11-NEXT: s_lshr_b32 s42, s21, 24 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s3, 16 -; GFX11-NEXT: s_lshr_b32 s34, s3, 8 -; GFX11-NEXT: s_lshr_b32 s57, s2, 16 -; GFX11-NEXT: v_writelane_b32 v37, s42, 19 -; GFX11-NEXT: s_lshr_b32 s42, s21, 16 -; GFX11-NEXT: s_lshr_b32 s56, s2, 8 -; GFX11-NEXT: s_lshr_b32 s35, s5, 24 -; GFX11-NEXT: s_lshr_b32 s36, s5, 16 -; GFX11-NEXT: v_writelane_b32 v37, s42, 20 -; GFX11-NEXT: s_lshr_b32 s42, s21, 8 -; GFX11-NEXT: s_lshr_b32 s37, s5, 8 -; GFX11-NEXT: s_lshr_b32 s38, s4, 16 -; GFX11-NEXT: s_lshr_b32 s39, s4, 8 -; GFX11-NEXT: v_writelane_b32 v37, s42, 21 -; GFX11-NEXT: s_lshr_b32 s42, s20, 16 -; GFX11-NEXT: s_lshr_b32 s48, s7, 24 -; GFX11-NEXT: s_lshr_b32 s49, s7, 16 -; GFX11-NEXT: s_lshr_b32 s50, s7, 8 -; GFX11-NEXT: v_writelane_b32 v37, s42, 22 -; GFX11-NEXT: s_lshr_b32 s42, s20, 8 -; GFX11-NEXT: s_lshr_b32 s51, s6, 16 -; GFX11-NEXT: s_lshr_b32 s52, s6, 8 -; GFX11-NEXT: s_lshr_b32 s53, s9, 24 -; GFX11-NEXT: v_writelane_b32 v37, s42, 23 -; GFX11-NEXT: s_lshr_b32 s42, s23, 24 -; GFX11-NEXT: s_lshr_b32 s54, s9, 16 -; GFX11-NEXT: s_lshr_b32 s55, s9, 8 -; GFX11-NEXT: s_lshr_b32 s64, s8, 16 -; GFX11-NEXT: v_writelane_b32 v37, s42, 24 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s65, s8, 8 -; GFX11-NEXT: s_lshr_b32 s66, s11, 24 -; GFX11-NEXT: s_lshr_b32 s67, s11, 16 -; GFX11-NEXT: v_writelane_b32 v37, s42, 25 -; GFX11-NEXT: s_lshr_b32 s42, s23, 8 -; GFX11-NEXT: s_lshr_b32 s68, s11, 8 -; GFX11-NEXT: s_lshr_b32 s59, s10, 16 -; GFX11-NEXT: s_lshr_b32 s58, s10, 8 -; GFX11-NEXT: v_writelane_b32 v37, s42, 26 -; GFX11-NEXT: s_lshr_b32 s42, s22, 16 -; GFX11-NEXT: s_lshr_b32 s70, s13, 24 -; GFX11-NEXT: s_lshr_b32 s71, s13, 16 -; GFX11-NEXT: s_lshr_b32 s60, s13, 8 -; GFX11-NEXT: v_writelane_b32 v37, s42, 27 -; GFX11-NEXT: s_lshr_b32 s42, s22, 8 -; GFX11-NEXT: s_lshr_b32 s80, s12, 16 -; GFX11-NEXT: s_lshr_b32 s61, s12, 8 -; GFX11-NEXT: s_lshr_b32 s81, s15, 24 -; GFX11-NEXT: v_writelane_b32 v37, s42, 28 -; GFX11-NEXT: s_lshr_b32 s42, s25, 24 -; GFX11-NEXT: s_lshr_b32 s82, s15, 16 -; GFX11-NEXT: s_lshr_b32 s83, s15, 8 -; GFX11-NEXT: s_lshr_b32 s84, s14, 16 -; GFX11-NEXT: v_writelane_b32 v37, s42, 29 -; GFX11-NEXT: s_lshr_b32 s42, s25, 16 -; GFX11-NEXT: s_lshr_b32 s85, s14, 8 -; GFX11-NEXT: s_lshr_b32 s86, s17, 24 -; GFX11-NEXT: s_lshr_b32 s72, s17, 16 -; GFX11-NEXT: v_writelane_b32 v37, s42, 30 -; GFX11-NEXT: s_lshr_b32 s42, s25, 8 -; GFX11-NEXT: s_lshr_b32 s87, s17, 8 -; GFX11-NEXT: s_lshr_b32 s73, s16, 16 -; GFX11-NEXT: s_lshr_b32 s96, s16, 8 -; GFX11-NEXT: v_writelane_b32 v37, s42, 31 -; GFX11-NEXT: s_lshr_b32 s42, s24, 16 -; GFX11-NEXT: s_lshr_b32 s97, s19, 24 -; GFX11-NEXT: v_writelane_b32 v36, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s24, 8 -; GFX11-NEXT: v_writelane_b32 v37, s62, 14 -; GFX11-NEXT: s_lshr_b32 s69, s19, 16 -; GFX11-NEXT: s_lshr_b32 s74, s28, 16 -; GFX11-NEXT: v_writelane_b32 v36, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s27, 24 -; GFX11-NEXT: v_writelane_b32 v37, s63, 15 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[2:3], 24 -; GFX11-NEXT: s_lshr_b32 s43, s28, 8 -; GFX11-NEXT: v_writelane_b32 v36, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s27, 16 -; GFX11-NEXT: v_writelane_b32 v37, s62, 12 -; GFX11-NEXT: s_lshr_b32 s98, s41, 24 -; GFX11-NEXT: s_lshr_b32 s99, s41, 16 -; GFX11-NEXT: v_writelane_b32 v36, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s27, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 13 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; GFX11-NEXT: s_lshr_b32 s100, s41, 8 -; GFX11-NEXT: v_writelane_b32 v36, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s26, 16 -; GFX11-NEXT: v_writelane_b32 v37, s62, 10 -; GFX11-NEXT: s_lshr_b32 s44, s40, 16 -; GFX11-NEXT: s_lshr_b32 s45, s40, 8 -; GFX11-NEXT: v_writelane_b32 v36, s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s26, 8 -; GFX11-NEXT: v_writelane_b32 v37, s63, 11 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[6:7], 24 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v36, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s29, 24 -; GFX11-NEXT: v_writelane_b32 v37, s62, 8 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[22:23], 24 -; GFX11-NEXT: v_writelane_b32 v36, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s29, 16 -; GFX11-NEXT: v_writelane_b32 v37, s63, 9 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[8:9], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; GFX11-NEXT: v_writelane_b32 v36, s42, 8 -; GFX11-NEXT: s_lshr_b32 s42, s29, 8 -; GFX11-NEXT: v_writelane_b32 v37, s62, 6 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[26:27], 24 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[28:29], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v36, s42, 9 -; GFX11-NEXT: v_writelane_b32 v37, s63, 7 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[2:3], 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[46:47], 24 +; GFX11-NEXT: v_writelane_b32 v37, s24, 24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[56:57], 24 +; GFX11-NEXT: s_lshr_b32 s95, s2, 8 +; GFX11-NEXT: s_lshr_b32 s31, s5, 24 +; GFX11-NEXT: s_lshr_b32 s97, s5, 16 +; GFX11-NEXT: v_writelane_b32 v37, s25, 25 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], 24 +; GFX11-NEXT: s_lshr_b32 s61, s5, 8 +; GFX11-NEXT: s_lshr_b32 s99, s4, 16 +; GFX11-NEXT: s_lshr_b32 s101, s4, 8 +; GFX11-NEXT: v_writelane_b32 v37, s24, 26 +; GFX11-NEXT: s_lshr_b32 s63, s7, 24 +; GFX11-NEXT: s_lshr_b32 s103, s7, 16 +; GFX11-NEXT: s_lshr_b32 s73, s7, 8 +; GFX11-NEXT: s_lshr_b32 s35, s6, 16 +; GFX11-NEXT: v_writelane_b32 v37, s25, 27 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[4:5], 24 +; GFX11-NEXT: s_lshr_b32 s75, s6, 8 +; GFX11-NEXT: s_lshr_b32 s37, s9, 24 +; GFX11-NEXT: s_lshr_b32 s39, s9, 16 +; GFX11-NEXT: v_writelane_b32 v37, s24, 22 +; GFX11-NEXT: s_lshr_b32 s77, s9, 8 +; GFX11-NEXT: s_lshr_b32 s38, s8, 16 +; GFX11-NEXT: s_lshr_b32 s96, s8, 8 +; GFX11-NEXT: s_lshr_b32 s98, s11, 24 +; GFX11-NEXT: v_writelane_b32 v37, s25, 23 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[6:7], 24 +; GFX11-NEXT: s_lshr_b32 s30, s11, 16 +; GFX11-NEXT: s_lshr_b32 s62, s11, 8 +; GFX11-NEXT: s_lshr_b32 s100, s10, 16 +; GFX11-NEXT: v_writelane_b32 v37, s24, 20 +; GFX11-NEXT: s_lshr_b32 s104, s10, 8 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s13, 24 +; GFX11-NEXT: s_lshr_b32 s72, s13, 16 +; GFX11-NEXT: s_lshr_b32 s102, s13, 8 +; GFX11-NEXT: v_writelane_b32 v37, s25, 21 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; GFX11-NEXT: s_lshr_b32 s74, s12, 16 +; GFX11-NEXT: s_lshr_b32 s34, s12, 8 +; GFX11-NEXT: s_lshr_b32 s36, s15, 24 +; GFX11-NEXT: v_writelane_b32 v37, s24, 18 +; GFX11-NEXT: s_lshr_b32 s60, s15, 16 +; GFX11-NEXT: s_lshr_b32 s48, s15, 8 +; GFX11-NEXT: s_lshr_b32 s49, s14, 16 +; GFX11-NEXT: s_lshr_b32 s50, s14, 8 +; GFX11-NEXT: v_writelane_b32 v37, s25, 19 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[10:11], 24 +; GFX11-NEXT: s_lshr_b32 s78, s17, 24 +; GFX11-NEXT: s_lshr_b32 s51, s17, 16 +; GFX11-NEXT: s_lshr_b32 s52, s17, 8 +; GFX11-NEXT: v_writelane_b32 v37, s24, 16 +; GFX11-NEXT: s_lshr_b32 s53, s16, 16 +; GFX11-NEXT: s_lshr_b32 s88, s16, 8 +; GFX11-NEXT: s_lshr_b32 s89, s19, 24 +; GFX11-NEXT: s_lshr_b32 s54, s19, 16 +; GFX11-NEXT: v_writelane_b32 v37, s25, 17 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[12:13], 24 +; GFX11-NEXT: s_lshr_b32 s55, s19, 8 +; GFX11-NEXT: s_lshr_b32 s64, s18, 16 +; GFX11-NEXT: s_lshr_b32 s65, s18, 8 +; GFX11-NEXT: v_writelane_b32 v37, s24, 14 +; GFX11-NEXT: s_lshr_b32 s90, s21, 24 +; GFX11-NEXT: s_lshr_b32 s91, s21, 16 +; GFX11-NEXT: s_lshr_b32 s66, s21, 8 +; GFX11-NEXT: s_lshr_b32 s67, s20, 16 +; GFX11-NEXT: v_writelane_b32 v37, s25, 15 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[14:15], 24 +; GFX11-NEXT: s_lshr_b32 s68, s20, 8 +; GFX11-NEXT: s_lshr_b32 s92, s23, 24 +; GFX11-NEXT: s_lshr_b32 s93, s23, 16 +; GFX11-NEXT: v_writelane_b32 v37, s24, 12 +; GFX11-NEXT: s_lshr_b32 s79, s23, 8 +; GFX11-NEXT: s_lshr_b32 s87, s22, 16 +; GFX11-NEXT: s_lshr_b32 s26, s29, 8 +; GFX11-NEXT: s_lshr_b32 s27, s28, 16 +; GFX11-NEXT: v_writelane_b32 v37, s25, 13 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[16:17], 24 +; GFX11-NEXT: s_lshr_b32 s42, s28, 8 +; GFX11-NEXT: s_lshr_b32 s43, s41, 24 +; GFX11-NEXT: s_lshr_b32 s44, s41, 16 +; GFX11-NEXT: v_writelane_b32 v37, s24, 10 +; GFX11-NEXT: s_lshr_b32 s45, s41, 8 +; GFX11-NEXT: s_lshr_b32 s69, s40, 16 +; GFX11-NEXT: s_lshr_b32 s70, s40, 8 +; GFX11-NEXT: s_lshr_b32 s71, s47, 24 +; GFX11-NEXT: v_writelane_b32 v37, s25, 11 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[18:19], 24 +; GFX11-NEXT: s_lshr_b32 s80, s47, 16 +; GFX11-NEXT: s_lshr_b32 s81, s47, 8 +; GFX11-NEXT: s_lshr_b32 s82, s46, 16 +; GFX11-NEXT: v_writelane_b32 v37, s24, 8 +; GFX11-NEXT: s_lshr_b32 s83, s46, 8 +; GFX11-NEXT: s_lshr_b32 s84, s57, 24 +; GFX11-NEXT: s_lshr_b32 s85, s57, 16 +; GFX11-NEXT: s_lshr_b32 s86, s57, 8 +; GFX11-NEXT: v_writelane_b32 v37, s25, 9 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[20:21], 24 +; GFX11-NEXT: s_lshr_b32 s58, s56, 16 +; GFX11-NEXT: s_lshr_b32 s59, s56, 8 +; GFX11-NEXT: v_writelane_b32 v37, s24, 6 +; GFX11-NEXT: v_writelane_b32 v37, s25, 7 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[22:23], 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v37, s62, 4 -; GFX11-NEXT: v_writelane_b32 v37, s63, 5 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX11-NEXT: v_writelane_b32 v37, s62, 2 -; GFX11-NEXT: v_writelane_b32 v37, s63, 3 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v37, s62, 0 -; GFX11-NEXT: v_writelane_b32 v37, s63, 1 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 +; GFX11-NEXT: v_writelane_b32 v37, s24, 4 +; GFX11-NEXT: v_writelane_b32 v37, s25, 5 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[28:29], 24 +; GFX11-NEXT: v_writelane_b32 v37, s24, 2 +; GFX11-NEXT: v_writelane_b32 v37, s25, 3 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[40:41], 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v37, s24, 0 +; GFX11-NEXT: s_lshr_b32 s24, s1, 24 +; GFX11-NEXT: v_writelane_b32 v37, s25, 1 +; GFX11-NEXT: v_writelane_b32 v37, s24, 28 +; GFX11-NEXT: s_lshr_b32 s24, s1, 16 +; GFX11-NEXT: v_writelane_b32 v37, s24, 29 +; GFX11-NEXT: s_lshr_b32 s24, s1, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v37, s24, 30 +; GFX11-NEXT: s_lshr_b32 s24, s0, 16 +; GFX11-NEXT: v_writelane_b32 v37, s24, 31 +; GFX11-NEXT: s_lshr_b32 s24, s0, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v36, s24, 0 +; GFX11-NEXT: s_lshr_b32 s24, s3, 24 +; GFX11-NEXT: v_writelane_b32 v36, s24, 1 +; GFX11-NEXT: s_lshr_b32 s24, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v36, s24, 2 +; GFX11-NEXT: s_lshr_b32 s24, s3, 8 +; GFX11-NEXT: v_writelane_b32 v36, s24, 3 +; GFX11-NEXT: s_lshr_b32 s24, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v36, s24, 4 +; GFX11-NEXT: s_lshr_b32 s24, s22, 8 +; GFX11-NEXT: v_writelane_b32 v36, s24, 5 +; GFX11-NEXT: s_lshr_b32 s24, s29, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v36, s24, 6 +; GFX11-NEXT: s_lshr_b32 s24, s29, 16 +; GFX11-NEXT: v_writelane_b32 v36, s24, 7 ; GFX11-NEXT: .LBB57_5: ; %end -; GFX11-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-NEXT: s_and_b32 s56, s56, 0xff +; GFX11-NEXT: s_lshl_b32 s59, s59, 8 +; GFX11-NEXT: s_and_b32 s58, s58, 0xff +; GFX11-NEXT: s_or_b32 s56, s56, s59 +; GFX11-NEXT: s_lshl_b32 s59, s76, 8 +; GFX11-NEXT: s_and_b32 s56, s56, 0xffff +; GFX11-NEXT: s_or_b32 s58, s58, s59 +; GFX11-NEXT: s_and_b32 s57, s57, 0xff +; GFX11-NEXT: s_lshl_b32 s58, s58, 16 +; GFX11-NEXT: s_lshl_b32 s59, s84, 8 +; GFX11-NEXT: s_or_b32 s56, s56, s58 +; GFX11-NEXT: s_lshl_b32 s58, s86, 8 +; GFX11-NEXT: s_and_b32 s46, s46, 0xff +; GFX11-NEXT: s_or_b32 s57, s57, s58 +; GFX11-NEXT: s_and_b32 s58, s85, 0xff +; GFX11-NEXT: s_and_b32 s57, s57, 0xffff +; GFX11-NEXT: s_or_b32 s58, s58, s59 +; GFX11-NEXT: s_lshl_b32 s59, s94, 8 +; GFX11-NEXT: s_lshl_b32 s58, s58, 16 +; GFX11-NEXT: s_and_b32 s47, s47, 0xff +; GFX11-NEXT: s_or_b32 s57, s57, s58 +; GFX11-NEXT: s_lshl_b32 s58, s83, 8 +; GFX11-NEXT: v_readlane_b32 s24, v37, 0 +; GFX11-NEXT: s_or_b32 s46, s46, s58 +; GFX11-NEXT: s_and_b32 s58, s82, 0xff +; GFX11-NEXT: s_and_b32 s46, s46, 0xffff +; GFX11-NEXT: s_or_b32 s58, s58, s59 +; GFX11-NEXT: s_lshl_b32 s59, s71, 8 +; GFX11-NEXT: s_lshl_b32 s58, s58, 16 ; GFX11-NEXT: s_and_b32 s28, s28, 0xff -; GFX11-NEXT: s_and_b32 s42, s74, 0xff -; GFX11-NEXT: s_or_b32 s28, s28, s43 -; GFX11-NEXT: s_lshl_b32 s43, s94, 8 -; GFX11-NEXT: s_and_b32 s28, s28, 0xffff -; GFX11-NEXT: s_or_b32 s42, s42, s43 -; GFX11-NEXT: s_and_b32 s29, s29, 0xff -; GFX11-NEXT: s_lshl_b32 s42, s42, 16 -; GFX11-NEXT: v_readlane_b32 s43, v36, 7 +; GFX11-NEXT: s_or_b32 s46, s46, s58 +; GFX11-NEXT: s_lshl_b32 s58, s81, 8 +; GFX11-NEXT: s_lshl_b32 s42, s42, 8 +; GFX11-NEXT: s_or_b32 s47, s47, s58 +; GFX11-NEXT: s_and_b32 s58, s80, 0xff +; GFX11-NEXT: s_and_b32 s47, s47, 0xffff +; GFX11-NEXT: s_or_b32 s58, s58, s59 ; GFX11-NEXT: s_or_b32 s28, s28, s42 -; GFX11-NEXT: v_readlane_b32 s42, v36, 9 -; GFX11-NEXT: s_and_b32 s26, s26, 0xff +; GFX11-NEXT: s_lshl_b32 s58, s58, 16 +; GFX11-NEXT: v_readlane_b32 s42, v37, 2 +; GFX11-NEXT: s_or_b32 s47, s47, s58 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s46 :: v_dual_mov_b32 v4, s47 +; GFX11-NEXT: s_lshl_b32 s47, s24, 8 +; GFX11-NEXT: v_readlane_b32 s24, v36, 7 +; GFX11-NEXT: v_readlane_b32 s25, v37, 1 ; GFX11-NEXT: s_and_b32 s27, s27, 0xff -; GFX11-NEXT: s_lshl_b32 s43, s43, 8 -; GFX11-NEXT: s_and_b32 s24, s24, 0xff ; GFX11-NEXT: s_lshl_b32 s42, s42, 8 -; GFX11-NEXT: s_and_b32 s25, s25, 0xff -; GFX11-NEXT: s_or_b32 s29, s29, s42 -; GFX11-NEXT: v_readlane_b32 s42, v36, 8 -; GFX11-NEXT: s_and_b32 s29, s29, 0xffff +; GFX11-NEXT: s_and_b32 s28, s28, 0xffff +; GFX11-NEXT: s_and_b32 s25, s24, 0xff +; GFX11-NEXT: v_readlane_b32 s24, v36, 6 +; GFX11-NEXT: s_or_b32 s27, s27, s42 +; GFX11-NEXT: s_lshl_b32 s26, s26, 8 +; GFX11-NEXT: s_lshl_b32 s27, s27, 16 ; GFX11-NEXT: s_and_b32 s22, s22, 0xff +; GFX11-NEXT: s_or_b32 s27, s28, s27 +; GFX11-NEXT: s_and_b32 s28, s29, 0xff +; GFX11-NEXT: s_lshl_b32 s24, s24, 8 +; GFX11-NEXT: s_or_b32 s26, s28, s26 +; GFX11-NEXT: s_or_b32 s24, s25, s24 +; GFX11-NEXT: s_and_b32 s25, s26, 0xffff +; GFX11-NEXT: s_lshl_b32 s24, s24, 16 +; GFX11-NEXT: v_readlane_b32 s26, v37, 4 +; GFX11-NEXT: s_or_b32 s24, s25, s24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v7, s27 :: v_dual_mov_b32 v8, s24 +; GFX11-NEXT: v_readlane_b32 s24, v36, 5 +; GFX11-NEXT: s_lshl_b32 s25, s26, 8 +; GFX11-NEXT: s_and_b32 s40, s40, 0xff +; GFX11-NEXT: s_lshl_b32 s46, s70, 8 ; GFX11-NEXT: s_and_b32 s23, s23, 0xff -; GFX11-NEXT: s_and_b32 s20, s20, 0xff -; GFX11-NEXT: s_and_b32 s42, s42, 0xff -; GFX11-NEXT: s_and_b32 s21, s21, 0xff -; GFX11-NEXT: s_or_b32 s42, s42, s43 +; GFX11-NEXT: s_lshl_b32 s24, s24, 8 +; GFX11-NEXT: s_or_b32 s40, s40, s46 +; GFX11-NEXT: s_or_b32 s22, s22, s24 +; GFX11-NEXT: s_and_b32 s24, s87, 0xff +; GFX11-NEXT: s_and_b32 s46, s69, 0xff +; GFX11-NEXT: s_or_b32 s24, s24, s25 +; GFX11-NEXT: s_lshl_b32 s25, s79, 8 +; GFX11-NEXT: s_and_b32 s41, s41, 0xff ; GFX11-NEXT: s_lshl_b32 s45, s45, 8 -; GFX11-NEXT: s_lshl_b32 s42, s42, 16 -; GFX11-NEXT: s_and_b32 s40, s40, 0xff -; GFX11-NEXT: s_or_b32 s29, s29, s42 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s28 :: v_dual_mov_b32 v4, s29 -; GFX11-NEXT: v_readlane_b32 s28, v36, 6 -; GFX11-NEXT: v_readlane_b32 s29, v36, 5 -; GFX11-NEXT: s_or_b32 s40, s40, s45 -; GFX11-NEXT: s_lshl_b32 s45, s30, 8 ; GFX11-NEXT: s_and_b32 s44, s44, 0xff -; GFX11-NEXT: s_lshl_b32 s28, s28, 8 -; GFX11-NEXT: s_and_b32 s29, s29, 0xff -; GFX11-NEXT: s_or_b32 s26, s26, s28 -; GFX11-NEXT: s_lshl_b32 s28, s92, 8 -; GFX11-NEXT: s_and_b32 s26, s26, 0xffff -; GFX11-NEXT: s_or_b32 s28, s29, s28 -; GFX11-NEXT: v_readlane_b32 s29, v36, 2 -; GFX11-NEXT: s_lshl_b32 s28, s28, 16 -; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_or_b32 s26, s26, s28 -; GFX11-NEXT: v_readlane_b32 s28, v36, 4 -; GFX11-NEXT: s_lshl_b32 s29, s29, 8 -; GFX11-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s43, s43, 8 +; GFX11-NEXT: s_or_b32 s23, s23, s25 +; GFX11-NEXT: s_and_b32 s25, s93, 0xff +; GFX11-NEXT: s_lshl_b32 s26, s92, 8 +; GFX11-NEXT: s_or_b32 s46, s46, s47 +; GFX11-NEXT: s_or_b32 s41, s41, s45 +; GFX11-NEXT: s_or_b32 s43, s44, s43 +; GFX11-NEXT: s_or_b32 s25, s25, s26 +; GFX11-NEXT: v_readlane_b32 s26, v37, 6 ; GFX11-NEXT: s_and_b32 s40, s40, 0xffff -; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_lshl_b32 s28, s28, 8 -; GFX11-NEXT: s_or_b32 s40, s40, s44 -; GFX11-NEXT: s_or_b32 s27, s27, s28 -; GFX11-NEXT: v_readlane_b32 s28, v36, 3 -; GFX11-NEXT: s_and_b32 s27, s27, 0xffff -; GFX11-NEXT: s_and_b32 s41, s41, 0xff -; GFX11-NEXT: s_lshl_b32 s44, s100, 8 -; GFX11-NEXT: s_lshl_b32 s45, s98, 8 -; GFX11-NEXT: s_and_b32 s28, s28, 0xff -; GFX11-NEXT: s_or_b32 s41, s41, s44 -; GFX11-NEXT: s_or_b32 s28, s28, s29 -; GFX11-NEXT: v_readlane_b32 s29, v36, 0 -; GFX11-NEXT: s_lshl_b32 s28, s28, 16 -; GFX11-NEXT: s_and_b32 s44, s99, 0xff -; GFX11-NEXT: s_or_b32 s27, s27, s28 -; GFX11-NEXT: v_readlane_b32 s28, v36, 1 -; GFX11-NEXT: s_and_b32 s29, s29, 0xff -; GFX11-NEXT: v_dual_mov_b32 v5, s26 :: v_dual_mov_b32 v6, s27 -; GFX11-NEXT: v_readlane_b32 s26, v37, 19 -; GFX11-NEXT: s_lshl_b32 s28, s28, 8 -; GFX11-NEXT: s_or_b32 s44, s44, s45 -; GFX11-NEXT: s_or_b32 s24, s24, s28 -; GFX11-NEXT: s_lshl_b32 s28, s90, 8 -; GFX11-NEXT: s_and_b32 s24, s24, 0xffff -; GFX11-NEXT: s_or_b32 s28, s29, s28 -; GFX11-NEXT: v_readlane_b32 s29, v37, 29 -; GFX11-NEXT: s_lshl_b32 s28, s28, 16 -; GFX11-NEXT: s_lshl_b32 s26, s26, 8 -; GFX11-NEXT: s_or_b32 s24, s24, s28 -; GFX11-NEXT: v_readlane_b32 s28, v37, 31 -; GFX11-NEXT: s_lshl_b32 s29, s29, 8 -; GFX11-NEXT: s_and_b32 s19, s19, 0xff +; GFX11-NEXT: s_lshl_b32 s46, s46, 16 ; GFX11-NEXT: s_and_b32 s41, s41, 0xffff -; GFX11-NEXT: s_lshl_b32 s44, s44, 16 -; GFX11-NEXT: s_lshl_b32 s28, s28, 8 -; GFX11-NEXT: s_or_b32 s41, s41, s44 -; GFX11-NEXT: s_or_b32 s25, s25, s28 -; GFX11-NEXT: v_readlane_b32 s28, v37, 30 -; GFX11-NEXT: s_and_b32 s25, s25, 0xffff -; GFX11-NEXT: v_dual_mov_b32 v1, s40 :: v_dual_mov_b32 v2, s41 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: s_and_b32 s28, s28, 0xff -; GFX11-NEXT: s_and_b32 s17, s17, 0xff -; GFX11-NEXT: s_or_b32 s28, s28, s29 -; GFX11-NEXT: s_and_b32 s14, s14, 0xff -; GFX11-NEXT: s_lshl_b32 s28, s28, 16 -; GFX11-NEXT: s_and_b32 s15, s15, 0xff -; GFX11-NEXT: s_or_b32 s25, s25, s28 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, s24 :: v_dual_mov_b32 v8, s25 -; GFX11-NEXT: v_readlane_b32 s24, v37, 28 -; GFX11-NEXT: v_readlane_b32 s25, v37, 27 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-NEXT: s_and_b32 s12, s12, 0xff -; GFX11-NEXT: s_lshl_b32 s24, s24, 8 -; GFX11-NEXT: s_and_b32 s25, s25, 0xff -; GFX11-NEXT: s_or_b32 s22, s22, s24 -; GFX11-NEXT: s_lshl_b32 s24, s78, 8 +; GFX11-NEXT: s_lshl_b32 s43, s43, 16 ; GFX11-NEXT: s_and_b32 s22, s22, 0xffff -; GFX11-NEXT: s_or_b32 s24, s25, s24 -; GFX11-NEXT: v_readlane_b32 s25, v37, 24 ; GFX11-NEXT: s_lshl_b32 s24, s24, 16 -; GFX11-NEXT: s_and_b32 s13, s13, 0xff -; GFX11-NEXT: s_or_b32 s22, s22, s24 -; GFX11-NEXT: v_readlane_b32 s24, v37, 26 -; GFX11-NEXT: s_lshl_b32 s25, s25, 8 -; GFX11-NEXT: s_and_b32 s10, s10, 0xff -; GFX11-NEXT: s_and_b32 s11, s11, 0xff -; GFX11-NEXT: s_and_b32 s8, s8, 0xff -; GFX11-NEXT: s_lshl_b32 s24, s24, 8 -; GFX11-NEXT: s_and_b32 s9, s9, 0xff -; GFX11-NEXT: s_or_b32 s23, s23, s24 -; GFX11-NEXT: v_readlane_b32 s24, v37, 25 ; GFX11-NEXT: s_and_b32 s23, s23, 0xffff -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_and_b32 s7, s7, 0xff -; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s24, s24, 0xff -; GFX11-NEXT: s_and_b32 s5, s5, 0xff -; GFX11-NEXT: s_or_b32 s24, s24, s25 -; GFX11-NEXT: v_readlane_b32 s25, v37, 22 -; GFX11-NEXT: s_lshl_b32 s24, s24, 16 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_or_b32 s23, s23, s24 -; GFX11-NEXT: v_readlane_b32 s24, v37, 23 -; GFX11-NEXT: s_and_b32 s25, s25, 0xff -; GFX11-NEXT: v_dual_mov_b32 v9, s22 :: v_dual_mov_b32 v10, s23 -; GFX11-NEXT: s_lshl_b32 s22, s88, 8 -; GFX11-NEXT: s_lshl_b32 s24, s24, 8 -; GFX11-NEXT: s_lshl_b32 s23, s97, 8 +; GFX11-NEXT: s_lshl_b32 s25, s25, 16 +; GFX11-NEXT: s_or_b32 s40, s40, s46 +; GFX11-NEXT: s_or_b32 s41, s41, s43 +; GFX11-NEXT: v_dual_mov_b32 v1, s56 :: v_dual_mov_b32 v2, s57 +; GFX11-NEXT: v_readlane_b32 s27, v37, 5 +; GFX11-NEXT: s_or_b32 s22, s22, s24 +; GFX11-NEXT: s_or_b32 s23, s23, s25 +; GFX11-NEXT: s_and_b32 s20, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s24, s68, 8 +; GFX11-NEXT: s_and_b32 s25, s67, 0xff +; GFX11-NEXT: s_lshl_b32 s26, s26, 8 +; GFX11-NEXT: v_dual_mov_b32 v5, s40 :: v_dual_mov_b32 v6, s41 +; GFX11-NEXT: v_readlane_b32 s27, v37, 7 ; GFX11-NEXT: s_or_b32 s20, s20, s24 -; GFX11-NEXT: s_lshl_b32 s24, s62, 8 +; GFX11-NEXT: s_or_b32 s24, s25, s26 +; GFX11-NEXT: s_and_b32 s21, s21, 0xff +; GFX11-NEXT: s_lshl_b32 s25, s66, 8 +; GFX11-NEXT: s_and_b32 s26, s91, 0xff +; GFX11-NEXT: s_lshl_b32 s27, s90, 8 +; GFX11-NEXT: s_or_b32 s21, s21, s25 +; GFX11-NEXT: s_or_b32 s25, s26, s27 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: v_dual_mov_b32 v1, s22 :: v_dual_mov_b32 v2, s23 +; GFX11-NEXT: v_readlane_b32 s22, v37, 8 ; GFX11-NEXT: s_and_b32 s20, s20, 0xffff -; GFX11-NEXT: s_or_b32 s24, s25, s24 -; GFX11-NEXT: v_readlane_b32 s25, v37, 21 ; GFX11-NEXT: s_lshl_b32 s24, s24, 16 -; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_and_b32 s21, s21, 0xffff +; GFX11-NEXT: s_lshl_b32 s25, s25, 16 ; GFX11-NEXT: s_or_b32 s20, s20, s24 -; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s25, s25, 8 -; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_or_b32 s21, s21, s25 -; GFX11-NEXT: v_readlane_b32 s25, v37, 20 -; GFX11-NEXT: s_and_b32 s21, s21, 0xffff -; GFX11-NEXT: v_readlane_b32 s100, v35, 4 -; GFX11-NEXT: v_readlane_b32 s99, v35, 3 -; GFX11-NEXT: v_readlane_b32 s98, v35, 2 -; GFX11-NEXT: s_and_b32 s25, s25, 0xff -; GFX11-NEXT: v_readlane_b32 s97, v35, 1 -; GFX11-NEXT: s_or_b32 s25, s25, s26 -; GFX11-NEXT: v_readlane_b32 s31, v34, 1 -; GFX11-NEXT: s_lshl_b32 s24, s25, 16 -; GFX11-NEXT: v_readlane_b32 s30, v34, 0 -; GFX11-NEXT: s_or_b32 s21, s21, s24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v12, s21 -; GFX11-NEXT: v_readlane_b32 s20, v37, 18 -; GFX11-NEXT: v_readlane_b32 s21, v37, 17 -; GFX11-NEXT: s_lshl_b32 s20, s20, 8 -; GFX11-NEXT: s_and_b32 s21, s21, 0xff +; GFX11-NEXT: v_dual_mov_b32 v3, s20 :: v_dual_mov_b32 v4, s21 +; GFX11-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s20, s65, 8 +; GFX11-NEXT: s_and_b32 s21, s64, 0xff +; GFX11-NEXT: s_lshl_b32 s22, s22, 8 +; GFX11-NEXT: v_readlane_b32 s23, v37, 9 ; GFX11-NEXT: s_or_b32 s18, s18, s20 ; GFX11-NEXT: s_or_b32 s20, s21, s22 -; GFX11-NEXT: v_readlane_b32 s21, v37, 16 -; GFX11-NEXT: s_and_b32 s22, s69, 0xff -; GFX11-NEXT: s_and_b32 s18, s18, 0xffff -; GFX11-NEXT: s_lshl_b32 s20, s20, 16 -; GFX11-NEXT: v_readlane_b32 s69, v34, 21 -; GFX11-NEXT: s_lshl_b32 s21, s21, 8 -; GFX11-NEXT: s_or_b32 s18, s18, s20 +; GFX11-NEXT: s_and_b32 s19, s19, 0xff +; GFX11-NEXT: s_lshl_b32 s21, s55, 8 +; GFX11-NEXT: s_and_b32 s22, s54, 0xff +; GFX11-NEXT: s_lshl_b32 s23, s89, 8 ; GFX11-NEXT: s_or_b32 s19, s19, s21 ; GFX11-NEXT: s_or_b32 s21, s22, s23 +; GFX11-NEXT: v_readlane_b32 s22, v37, 10 +; GFX11-NEXT: s_and_b32 s18, s18, 0xffff +; GFX11-NEXT: s_lshl_b32 s20, s20, 16 ; GFX11-NEXT: s_and_b32 s19, s19, 0xffff ; GFX11-NEXT: s_lshl_b32 s21, s21, 16 -; GFX11-NEXT: s_lshl_b32 s20, s96, 8 +; GFX11-NEXT: s_or_b32 s18, s18, s20 ; GFX11-NEXT: s_or_b32 s19, s19, s21 -; GFX11-NEXT: s_and_b32 s21, s73, 0xff -; GFX11-NEXT: s_lshl_b32 s22, s76, 8 +; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s20, s88, 8 +; GFX11-NEXT: s_and_b32 s21, s53, 0xff +; GFX11-NEXT: s_lshl_b32 s22, s22, 8 +; GFX11-NEXT: v_readlane_b32 s23, v37, 11 ; GFX11-NEXT: s_or_b32 s16, s16, s20 ; GFX11-NEXT: s_or_b32 s20, s21, s22 -; GFX11-NEXT: s_lshl_b32 s21, s87, 8 -; GFX11-NEXT: s_and_b32 s22, s72, 0xff -; GFX11-NEXT: s_lshl_b32 s23, s86, 8 +; GFX11-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-NEXT: s_lshl_b32 s21, s52, 8 +; GFX11-NEXT: s_and_b32 s22, s51, 0xff +; GFX11-NEXT: s_lshl_b32 s23, s78, 8 ; GFX11-NEXT: s_or_b32 s17, s17, s21 ; GFX11-NEXT: s_or_b32 s21, s22, s23 -; GFX11-NEXT: v_dual_mov_b32 v1, s18 :: v_dual_mov_b32 v2, s19 -; GFX11-NEXT: v_readlane_b32 s18, v37, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, s18 :: v_dual_mov_b32 v6, s19 +; GFX11-NEXT: v_readlane_b32 s18, v37, 12 ; GFX11-NEXT: s_and_b32 s16, s16, 0xffff ; GFX11-NEXT: s_lshl_b32 s20, s20, 16 ; GFX11-NEXT: s_and_b32 s17, s17, 0xffff @@ -85997,38 +85596,42 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_or_b32 s16, s16, s20 ; GFX11-NEXT: s_or_b32 s17, s17, s21 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_mov_b32 v4, s17 -; GFX11-NEXT: s_lshl_b32 s16, s85, 8 -; GFX11-NEXT: s_and_b32 s17, s84, 0xff +; GFX11-NEXT: v_dual_mov_b32 v7, s16 :: v_dual_mov_b32 v8, s17 +; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_lshl_b32 s16, s50, 8 +; GFX11-NEXT: s_and_b32 s17, s49, 0xff ; GFX11-NEXT: s_lshl_b32 s18, s18, 8 -; GFX11-NEXT: v_readlane_b32 s19, v37, 1 +; GFX11-NEXT: v_readlane_b32 s19, v37, 13 ; GFX11-NEXT: s_or_b32 s14, s14, s16 ; GFX11-NEXT: s_or_b32 s16, s17, s18 -; GFX11-NEXT: s_lshl_b32 s17, s83, 8 -; GFX11-NEXT: s_and_b32 s18, s82, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s81, 8 +; GFX11-NEXT: s_and_b32 s15, s15, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s48, 8 +; GFX11-NEXT: s_and_b32 s18, s60, 0xff +; GFX11-NEXT: s_lshl_b32 s19, s36, 8 ; GFX11-NEXT: s_or_b32 s15, s15, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: v_readlane_b32 s18, v37, 2 +; GFX11-NEXT: v_readlane_b32 s18, v37, 14 ; GFX11-NEXT: s_and_b32 s14, s14, 0xffff ; GFX11-NEXT: s_lshl_b32 s16, s16, 16 ; GFX11-NEXT: s_and_b32 s15, s15, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 ; GFX11-NEXT: s_or_b32 s14, s14, s16 ; GFX11-NEXT: s_or_b32 s15, s15, s17 -; GFX11-NEXT: s_lshl_b32 s16, s61, 8 -; GFX11-NEXT: s_and_b32 s17, s80, 0xff +; GFX11-NEXT: s_and_b32 s12, s12, 0xff +; GFX11-NEXT: s_lshl_b32 s16, s34, 8 +; GFX11-NEXT: s_and_b32 s17, s74, 0xff ; GFX11-NEXT: s_lshl_b32 s18, s18, 8 -; GFX11-NEXT: v_readlane_b32 s19, v37, 3 +; GFX11-NEXT: v_readlane_b32 s19, v37, 15 ; GFX11-NEXT: s_or_b32 s12, s12, s16 ; GFX11-NEXT: s_or_b32 s16, s17, s18 -; GFX11-NEXT: s_lshl_b32 s17, s60, 8 -; GFX11-NEXT: s_and_b32 s18, s71, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s70, 8 +; GFX11-NEXT: s_and_b32 s13, s13, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s102, 8 +; GFX11-NEXT: s_and_b32 s18, s72, 0xff +; GFX11-NEXT: s_lshl_b32 s19, vcc_hi, 8 ; GFX11-NEXT: s_or_b32 s13, s13, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: v_dual_mov_b32 v5, s14 :: v_dual_mov_b32 v6, s15 -; GFX11-NEXT: v_readlane_b32 s14, v37, 4 +; GFX11-NEXT: v_dual_mov_b32 v9, s14 :: v_dual_mov_b32 v10, s15 +; GFX11-NEXT: v_readlane_b32 s14, v37, 16 ; GFX11-NEXT: s_and_b32 s12, s12, 0xffff ; GFX11-NEXT: s_lshl_b32 s16, s16, 16 ; GFX11-NEXT: s_and_b32 s13, s13, 0xffff @@ -86036,41 +85639,45 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_or_b32 s12, s12, s16 ; GFX11-NEXT: s_or_b32 s13, s13, s17 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, s12 :: v_dual_mov_b32 v8, s13 -; GFX11-NEXT: s_lshl_b32 s12, s58, 8 -; GFX11-NEXT: s_and_b32 s13, s59, 0xff +; GFX11-NEXT: v_dual_mov_b32 v11, s12 :: v_dual_mov_b32 v12, s13 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s104, 8 +; GFX11-NEXT: s_and_b32 s13, s100, 0xff ; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: v_readlane_b32 s15, v37, 5 +; GFX11-NEXT: v_readlane_b32 s15, v37, 17 ; GFX11-NEXT: s_or_b32 s10, s10, s12 ; GFX11-NEXT: s_or_b32 s12, s13, s14 -; GFX11-NEXT: s_lshl_b32 s13, s68, 8 -; GFX11-NEXT: s_and_b32 s14, s67, 0xff -; GFX11-NEXT: s_lshl_b32 s15, s66, 8 +; GFX11-NEXT: s_and_b32 s11, s11, 0xff +; GFX11-NEXT: s_lshl_b32 s13, s62, 8 +; GFX11-NEXT: s_and_b32 s14, s30, 0xff +; GFX11-NEXT: s_lshl_b32 s15, s98, 8 ; GFX11-NEXT: s_or_b32 s11, s11, s13 ; GFX11-NEXT: s_or_b32 s13, s14, s15 -; GFX11-NEXT: v_readlane_b32 s14, v37, 6 +; GFX11-NEXT: v_readlane_b32 s14, v37, 18 ; GFX11-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-NEXT: s_lshl_b32 s12, s12, 16 ; GFX11-NEXT: s_and_b32 s11, s11, 0xffff ; GFX11-NEXT: s_lshl_b32 s13, s13, 16 ; GFX11-NEXT: s_or_b32 s10, s10, s12 ; GFX11-NEXT: s_or_b32 s11, s11, s13 -; GFX11-NEXT: s_lshl_b32 s12, s65, 8 -; GFX11-NEXT: s_and_b32 s13, s64, 0xff +; GFX11-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-NEXT: s_lshl_b32 s12, s96, 8 +; GFX11-NEXT: s_and_b32 s13, s38, 0xff ; GFX11-NEXT: s_lshl_b32 s14, s14, 8 -; GFX11-NEXT: v_readlane_b32 s15, v37, 7 +; GFX11-NEXT: v_readlane_b32 s15, v37, 19 ; GFX11-NEXT: s_or_b32 s8, s8, s12 ; GFX11-NEXT: s_or_b32 s12, s13, s14 -; GFX11-NEXT: s_lshl_b32 s13, s55, 8 -; GFX11-NEXT: s_and_b32 s14, s54, 0xff -; GFX11-NEXT: s_lshl_b32 s15, s53, 8 +; GFX11-NEXT: s_and_b32 s9, s9, 0xff +; GFX11-NEXT: s_lshl_b32 s13, s77, 8 +; GFX11-NEXT: s_and_b32 s14, s39, 0xff +; GFX11-NEXT: s_lshl_b32 s15, s37, 8 ; GFX11-NEXT: s_or_b32 s9, s9, s13 ; GFX11-NEXT: s_or_b32 s13, s14, s15 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 -; GFX11-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 -; GFX11-NEXT: v_readlane_b32 s10, v37, 8 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-NEXT: v_dual_mov_b32 v1, s10 :: v_dual_mov_b32 v2, s11 +; GFX11-NEXT: v_readlane_b32 s10, v37, 20 ; GFX11-NEXT: s_and_b32 s8, s8, 0xffff ; GFX11-NEXT: s_lshl_b32 s12, s12, 16 ; GFX11-NEXT: s_and_b32 s9, s9, 0xffff @@ -86078,92 +85685,113 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_or_b32 s8, s8, s12 ; GFX11-NEXT: s_or_b32 s9, s9, s13 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v11, s8 :: v_dual_mov_b32 v12, s9 -; GFX11-NEXT: s_lshl_b32 s8, s52, 8 -; GFX11-NEXT: s_and_b32 s9, s51, 0xff +; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s75, 8 +; GFX11-NEXT: s_and_b32 s9, s35, 0xff ; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: v_readlane_b32 s11, v37, 9 +; GFX11-NEXT: v_readlane_b32 s11, v37, 21 ; GFX11-NEXT: s_or_b32 s6, s6, s8 ; GFX11-NEXT: s_or_b32 s8, s9, s10 -; GFX11-NEXT: s_lshl_b32 s9, s50, 8 -; GFX11-NEXT: s_and_b32 s10, s49, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s48, 8 +; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s73, 8 +; GFX11-NEXT: s_and_b32 s10, s103, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s63, 8 ; GFX11-NEXT: s_or_b32 s7, s7, s9 ; GFX11-NEXT: s_or_b32 s9, s10, s11 -; GFX11-NEXT: v_readlane_b32 s10, v37, 10 +; GFX11-NEXT: v_readlane_b32 s10, v37, 22 ; GFX11-NEXT: s_and_b32 s6, s6, 0xffff ; GFX11-NEXT: s_lshl_b32 s8, s8, 16 ; GFX11-NEXT: s_and_b32 s7, s7, 0xffff ; GFX11-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-NEXT: s_or_b32 s6, s6, s8 ; GFX11-NEXT: s_or_b32 s7, s7, s9 -; GFX11-NEXT: s_lshl_b32 s8, s39, 8 -; GFX11-NEXT: s_and_b32 s9, s38, 0xff +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s101, 8 +; GFX11-NEXT: s_and_b32 s9, s99, 0xff ; GFX11-NEXT: s_lshl_b32 s10, s10, 8 -; GFX11-NEXT: v_readlane_b32 s11, v37, 11 +; GFX11-NEXT: v_readlane_b32 s11, v37, 23 ; GFX11-NEXT: s_or_b32 s4, s4, s8 ; GFX11-NEXT: s_or_b32 s8, s9, s10 -; GFX11-NEXT: s_lshl_b32 s9, s37, 8 -; GFX11-NEXT: s_and_b32 s10, s36, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s35, 8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-NEXT: s_lshl_b32 s9, s61, 8 +; GFX11-NEXT: s_and_b32 s10, s97, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s31, 8 ; GFX11-NEXT: s_or_b32 s5, s5, s9 ; GFX11-NEXT: s_or_b32 s9, s10, s11 -; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 -; GFX11-NEXT: v_readlane_b32 s6, v37, 12 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s8, s8, 16 ; GFX11-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-NEXT: s_or_b32 s4, s4, s8 ; GFX11-NEXT: s_or_b32 s5, s5, s9 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 -; GFX11-NEXT: s_lshl_b32 s4, s56, 8 -; GFX11-NEXT: s_and_b32 s5, s57, 0xff +; GFX11-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s7 +; GFX11-NEXT: v_dual_mov_b32 v7, s4 :: v_dual_mov_b32 v8, s5 +; GFX11-NEXT: v_readlane_b32 s5, v36, 4 +; GFX11-NEXT: v_readlane_b32 s6, v37, 24 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s95, 8 +; GFX11-NEXT: v_readlane_b32 s7, v37, 25 +; GFX11-NEXT: s_and_b32 s5, s5, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v37, 13 ; GFX11-NEXT: s_or_b32 s2, s2, s4 ; GFX11-NEXT: s_or_b32 s4, s5, s6 -; GFX11-NEXT: s_lshl_b32 s5, s34, 8 -; GFX11-NEXT: s_and_b32 s6, vcc_hi, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s46, 8 +; GFX11-NEXT: v_readlane_b32 s5, v36, 3 +; GFX11-NEXT: v_readlane_b32 s6, v36, 2 +; GFX11-NEXT: v_readlane_b32 s7, v36, 1 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 ; GFX11-NEXT: s_or_b32 s3, s3, s5 ; GFX11-NEXT: s_or_b32 s5, s6, s7 -; GFX11-NEXT: v_readlane_b32 s6, v37, 14 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s4 ; GFX11-NEXT: s_or_b32 s3, s3, s5 -; GFX11-NEXT: s_lshl_b32 s4, s47, 8 -; GFX11-NEXT: s_and_b32 s5, s104, 0xff +; GFX11-NEXT: v_readlane_b32 s4, v36, 0 +; GFX11-NEXT: v_readlane_b32 s5, v37, 31 +; GFX11-NEXT: v_readlane_b32 s6, v37, 26 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: v_readlane_b32 s7, v37, 27 +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-NEXT: s_and_b32 s5, s5, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v37, 15 ; GFX11-NEXT: s_or_b32 s0, s0, s4 ; GFX11-NEXT: s_or_b32 s4, s5, s6 -; GFX11-NEXT: s_lshl_b32 s5, s103, 8 -; GFX11-NEXT: s_and_b32 s6, s102, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s101, 8 +; GFX11-NEXT: v_readlane_b32 s5, v37, 30 +; GFX11-NEXT: v_readlane_b32 s6, v37, 29 +; GFX11-NEXT: v_readlane_b32 s7, v37, 28 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s5 ; GFX11-NEXT: s_or_b32 s5, s6, s7 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s4 ; GFX11-NEXT: s_or_b32 s1, s1, s5 -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 -; GFX11-NEXT: v_dual_mov_b32 v5, s2 :: v_dual_mov_b32 v6, s3 -; GFX11-NEXT: v_dual_mov_b32 v7, s0 :: v_dual_mov_b32 v8, s1 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-NEXT: v_dual_mov_b32 v9, s2 :: v_dual_mov_b32 v10, s3 +; GFX11-NEXT: v_dual_mov_b32 v11, s0 :: v_dual_mov_b32 v12, s1 +; GFX11-NEXT: v_readlane_b32 s43, v37, 3 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:112 ; GFX11-NEXT: v_readlane_b32 s104, v35, 8 ; GFX11-NEXT: v_readlane_b32 s103, v35, 7 ; GFX11-NEXT: v_readlane_b32 s102, v35, 6 ; GFX11-NEXT: v_readlane_b32 s101, v35, 5 +; GFX11-NEXT: v_readlane_b32 s100, v35, 4 +; GFX11-NEXT: v_readlane_b32 s99, v35, 3 +; GFX11-NEXT: v_readlane_b32 s98, v35, 2 +; GFX11-NEXT: v_readlane_b32 s97, v35, 1 ; GFX11-NEXT: v_readlane_b32 s96, v35, 0 ; GFX11-NEXT: v_readlane_b32 s87, v34, 31 ; GFX11-NEXT: v_readlane_b32 s86, v34, 30 @@ -86175,6 +85803,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s80, v34, 24 ; GFX11-NEXT: v_readlane_b32 s71, v34, 23 ; GFX11-NEXT: v_readlane_b32 s70, v34, 22 +; GFX11-NEXT: v_readlane_b32 s69, v34, 21 ; GFX11-NEXT: v_readlane_b32 s68, v34, 20 ; GFX11-NEXT: v_readlane_b32 s67, v34, 19 ; GFX11-NEXT: v_readlane_b32 s66, v34, 18 @@ -86194,6 +85823,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s36, v34, 4 ; GFX11-NEXT: v_readlane_b32 s35, v34, 3 ; GFX11-NEXT: v_readlane_b32 s34, v34, 2 +; GFX11-NEXT: v_readlane_b32 s31, v34, 1 +; GFX11-NEXT: v_readlane_b32 s30, v34, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v34, off, s32 @@ -86240,22 +85871,22 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 @@ -86274,33 +85905,33 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:152 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:184 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -86357,19 +85988,19 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 @@ -86377,19 +86008,19 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -86400,7 +86031,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 @@ -86408,27 +86039,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 @@ -86437,15 +86068,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 @@ -86459,24 +86090,24 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 @@ -86485,29 +86116,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:268 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 @@ -86516,29 +86144,29 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 @@ -86547,29 +86175,29 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 @@ -86578,239 +86206,307 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: v_or_b32_e32 v3, v40, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v5, v41, v5 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v36 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v43 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v63 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v6, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v34 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v38 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v8, v8, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v8, v8, v55 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v46 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v51 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v63 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v51 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v45 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -86828,204 +86524,299 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v59 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v43 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 ; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -87037,351 +86828,189 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: .LBB58_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v45, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v5, v41, v5 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v4, v43, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_or_b32_e32 v5, v63, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v56, v6 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v46, v7 +; SI-NEXT: v_or_b32_e32 v7, v38, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v8, v44, v8 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 @@ -87389,15 +87018,14 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 @@ -87405,12 +87033,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 @@ -87418,12 +87046,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 @@ -87431,12 +87059,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -87444,12 +87072,14 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 @@ -87481,7 +87111,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87490,7 +87120,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -87498,7 +87128,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87507,7 +87137,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -87515,7 +87145,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87524,7 +87154,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -87532,7 +87162,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87541,7 +87171,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v19 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -87549,7 +87179,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87558,7 +87188,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -87566,7 +87196,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87575,7 +87205,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -87583,7 +87213,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87592,15 +87222,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87609,32 +87239,30 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87643,15 +87271,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87660,15 +87288,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87677,15 +87305,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87694,15 +87322,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87711,15 +87339,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -87728,19 +87356,21 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_or_b32_e32 v31, v43, v31 ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 @@ -87817,16 +87447,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v9 @@ -87862,43 +87492,42 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill @@ -87908,13 +87537,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 @@ -87924,11 +87553,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 @@ -87937,7 +87566,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -87947,7 +87576,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill @@ -87961,19 +87590,19 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill @@ -87987,25 +87616,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -88013,25 +87642,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -88039,25 +87668,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -88065,15 +87694,15 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 @@ -88082,88 +87711,88 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB58_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v14, v44, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: v_or_b32_sdwa v15, v41, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -88187,84 +87816,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -88278,41 +87839,41 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -88330,11 +87891,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -88347,17 +87908,17 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -88366,289 +87927,359 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: .LBB58_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB58_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v31, 0x300 +; VI-NEXT: v_add_u16_e32 v9, 3, v62 ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 -; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v5, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -88680,11 +88311,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v62 +; VI-NEXT: v_add_u16_e32 v8, 3, v63 ; VI-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v32 +; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v10, 3, v60 ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -88693,9 +88324,8 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v10, v10, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v9, v9, v10 ; VI-NEXT: v_add_u16_e32 v10, 3, v58 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v11, 3, v56 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -88717,7 +88347,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 3, v46 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 ; VI-NEXT: v_add_u16_sdwa v13, v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v12, v12, v13 @@ -88725,39 +88355,38 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 3, v44 -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_add_u16_sdwa v14, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v42 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v14, 3, v43 ; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v15, 3, v40 +; VI-NEXT: v_add_u16_e32 v15, 3, v41 ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_sdwa v15, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v14, v14, v15 ; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v15, 3, v15 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 @@ -88765,12 +88394,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v16, v17 ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v18, 3, v18 @@ -88803,7 +88432,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 @@ -88822,14 +88451,14 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v21, 0x300, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v22, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 @@ -88856,46 +88485,46 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v24, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v23, v24 ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v26, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -88907,21 +88536,21 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v28, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v27, v27, v28 -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v28, 3, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v29, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v28, v28, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -88934,7 +88563,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v29, v30 ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v30, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -88946,7 +88575,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v32, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v30, v30, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v32, 3, v32 @@ -88956,7 +88585,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v33, 3, v33 -; VI-NEXT: v_or_b32_sdwa v33, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: .LBB58_4: ; %end @@ -89033,16 +88662,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v9 @@ -89088,47 +88717,45 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v32 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill @@ -89138,13 +88765,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 @@ -89155,11 +88782,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 @@ -89168,7 +88795,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill @@ -89180,7 +88807,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill @@ -89195,19 +88822,19 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill @@ -89222,25 +88849,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -89249,25 +88876,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -89276,25 +88903,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -89303,106 +88930,105 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB58_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_or_b32_sdwa v15, v41, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -89426,84 +89052,16 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -89517,41 +89075,41 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -89569,11 +89127,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -89586,17 +89144,17 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -89605,296 +89163,363 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: .LBB58_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB58_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v62 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(27) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -89926,11 +89551,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 ; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v32 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -89939,7 +89564,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 @@ -89962,7 +89587,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 @@ -89970,39 +89595,38 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v14, 3, v43 ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v41 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 @@ -90010,12 +89634,12 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 @@ -90048,7 +89672,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 @@ -90067,14 +89691,14 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 @@ -90101,46 +89725,46 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v25 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -90152,21 +89776,21 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -90179,7 +89803,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -90191,7 +89815,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 @@ -90201,7 +89825,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v32, v63, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 ; GFX9-NEXT: .LBB58_4: ; %end @@ -92018,254 +91642,244 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v25 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 -; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v44 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v43 -; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v42 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v54 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v0 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -92273,920 +91887,933 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB59_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v59, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v2, v2, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v61 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v47, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v0, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 -; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v43 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mov_b32_e32 v50, v16 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v16, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: v_mov_b32_e32 v48, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v17, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v18, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 -; SI-NEXT: v_mov_b32_e32 v54, v23 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v20, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v3 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v21, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v22, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v45, v24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v34, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 -; SI-NEXT: v_or_b32_e32 v23, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v61 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v24, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v25, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 -; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v26, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v37, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v52, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_or_b32_e32 v27, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_mov_b32_e32 v34, v41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v28, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v29, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v30, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v31, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s20, 0xff -; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s24, 0xff -; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_branch .LBB59_3 ; SI-NEXT: .LBB59_2: -; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v43 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v34, v41 +; SI-NEXT: v_mov_b32_e32 v33, v60 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB59_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mov_b32_e32 v35, v57 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) expcnt(3) +; SI-NEXT: v_mov_b32_e32 v59, v62 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB59_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_and_b32 s8, s26, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, s7, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v6, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v7, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v8, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v9, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v10, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v10 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v11, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v11 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v12, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v12 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v13, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v13 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v14, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v50, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v16, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v59, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v17, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v18, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v18 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v19, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v19 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v57, v4 +; SI-NEXT: v_or_b32_e32 v20, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v21, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v46, v1 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -93194,14 +92821,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -93209,20 +92836,19 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v0 @@ -93268,115 +92894,114 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v23 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 @@ -93387,26 +93012,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v11 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v7 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -93416,807 +93037,814 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:116 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB59_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff ; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s8, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v46, v0 -; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v62, v0 -; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v63, v1 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v1 +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_mov_b32_e32 v60, v0 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v35, v0 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v44, v0 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v34, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v59, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v63, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v51 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v61 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v55, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v37, v53 ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v54, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v41, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v33 +; VI-NEXT: v_mov_b32_e32 v38, v54 +; VI-NEXT: v_or_b32_sdwa v0, v54, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v63 +; VI-NEXT: v_mov_b32_e32 v46, v33 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v51, v34 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v61, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v56 -; VI-NEXT: v_or_b32_sdwa v0, v56, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v38, v39 -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v53 -; VI-NEXT: v_mov_b32_e32 v52, v36 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v0, v36, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v60 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v1, v33, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v34, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v62, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v59, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v50, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v49, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v51, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v56, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v48, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v39, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v50, v40 -; VI-NEXT: v_mov_b32_e32 v49, v51 -; VI-NEXT: v_mov_b32_e32 v40, v34 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v36, v62 +; VI-NEXT: v_mov_b32_e32 v59, v58 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v62, v32 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_branch .LBB59_3 ; VI-NEXT: .LBB59_2: -; VI-NEXT: v_mov_b32_e32 v44, v56 -; VI-NEXT: v_mov_b32_e32 v41, v33 -; VI-NEXT: v_mov_b32_e32 v50, v40 -; VI-NEXT: v_mov_b32_e32 v38, v39 -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v54, v53 -; VI-NEXT: v_mov_b32_e32 v52, v36 -; VI-NEXT: v_mov_b32_e32 v49, v51 +; VI-NEXT: v_mov_b32_e32 v48, v63 +; VI-NEXT: v_mov_b32_e32 v37, v53 +; VI-NEXT: v_mov_b32_e32 v35, v51 +; VI-NEXT: v_mov_b32_e32 v38, v54 +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v36, v62 +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v46, v33 +; VI-NEXT: v_mov_b32_e32 v50, v32 +; VI-NEXT: v_mov_b32_e32 v59, v58 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB59_3: ; %Flow -; VI-NEXT: v_mov_b32_e32 v51, v41 -; VI-NEXT: v_mov_b32_e32 v36, v44 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v54, v60 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v52, v59 +; VI-NEXT: v_mov_b32_e32 v58, v36 +; VI-NEXT: v_mov_b32_e32 v59, v38 ; VI-NEXT: s_cbranch_vccnz .LBB59_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_mov_b32_e32 v33, v35 +; VI-NEXT: v_mov_b32_e32 v35, v37 +; VI-NEXT: v_mov_b32_e32 v37, v48 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, s7, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v16 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v17 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v41, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v19, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v2 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v21, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v21 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v46 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v59 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v61 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v58 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v56 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v47 +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v45 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -94261,128 +93889,130 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:80 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:144 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:176 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v41 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v55 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v45 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v20 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v28 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v30 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 @@ -94392,7 +94022,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: v_lshlrev_b32_e32 v41, 8, v41 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 @@ -94404,16 +94033,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v7 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -94424,423 +94052,410 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v13 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v5 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 -; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:252 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB59_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s8, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v59, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_mov_b32_e32 v61, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_mov_b32_e32 v37, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v47, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v61, v52 +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v35, v62 -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v40, v30 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v36, v31 -; GFX9-NEXT: v_mov_b32_e32 v45, v62 -; GFX9-NEXT: v_mov_b32_e32 v46, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v53 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v46, v42 +; GFX9-NEXT: v_mov_b32_e32 v53, v51 +; GFX9-NEXT: v_mov_b32_e32 v50, v33 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 +; GFX9-NEXT: v_mov_b32_e32 v42, v36 +; GFX9-NEXT: v_mov_b32_e32 v51, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 -; GFX9-NEXT: s_and_b32 s4, s16, 0xff -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s18, 0xff -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_branch .LBB59_3 ; GFX9-NEXT: .LBB59_2: -; GFX9-NEXT: v_mov_b32_e32 v33, v43 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v62 -; GFX9-NEXT: v_mov_b32_e32 v36, v31 -; GFX9-NEXT: v_mov_b32_e32 v40, v30 +; GFX9-NEXT: v_mov_b32_e32 v61, v52 +; GFX9-NEXT: v_mov_b32_e32 v46, v42 +; GFX9-NEXT: v_mov_b32_e32 v53, v51 +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v50, v33 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB59_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v62, v35 -; GFX9-NEXT: v_mov_b32_e32 v35, v38 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB59_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -94884,160 +94499,163 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s8, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s29, 8 ; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 ; GFX9-NEXT: s_movk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v53 +; GFX9-NEXT: v_mov_b32_e32 v54, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v50 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 @@ -95045,40 +94663,41 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 @@ -95086,153 +94705,159 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 3, v53 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v46 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v62 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v58 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v57 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v56 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -95478,309 +95103,241 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v91 -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v49 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 -; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v78 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v79 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v35 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v88 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v63 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v72 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v62 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v73 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v74 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v75 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v47 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v45 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v58 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v59 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v182 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v56 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v40 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v41 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v165 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v43 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v44 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v163 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v167 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v16, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v148 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v177 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v146 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v18, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v147 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v118 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v150 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v103 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v151 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v160 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v20, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v101 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v132 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v112 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v132 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v133 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v84 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v102 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v144 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v119 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v128 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v25, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v130 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v24, v131 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v84 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v130 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v131 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v114 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v116 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v27, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v94, 0xff, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v81 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v116 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v69 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v2, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v4, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v94, v89 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v99 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v30, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v2, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_3 @@ -96419,309 +95976,241 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91 -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v49 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 -; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v6, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v35 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v8, v63 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v33 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v62 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v10, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v47 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v45 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v12, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v182 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v13, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v180 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v14, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v165 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v15, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v163 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v16, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v148 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v177 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v146 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v18, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v118 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v103 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v20, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v101 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v21, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v86 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v22, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v23, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v81 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v24, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v25, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v24, v131 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v94, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v2, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v4, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v94, v89 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v99 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v30, v31 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v2, v3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_3 @@ -99353,18 +98842,18 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -101916,650 +101405,655 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: v_mov_b32_e32 v43, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v58, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v28, v27 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v41, v23 -; SI-NEXT: v_mov_b32_e32 v29, v20 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v44, v20 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v59, v29 +; SI-NEXT: v_mov_b32_e32 v60, v24 +; SI-NEXT: v_mov_b32_e32 v41, v22 +; SI-NEXT: v_mov_b32_e32 v22, v13 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s28 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v37 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v57 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 ; SI-NEXT: s_cbranch_scc0 .LBB63_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_mov_b32_e32 v47, v3 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v23 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v1, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 -; SI-NEXT: v_mov_b32_e32 v2, v14 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v3, v16 -; SI-NEXT: v_mov_b32_e32 v20, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v63 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v4, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v5, v29 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[4:5], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v5, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 -; SI-NEXT: v_mov_b32_e32 v6, v45 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[5:6], v[9:10], 16 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v57 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[6:7], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v7, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 -; SI-NEXT: v_mov_b32_e32 v8, v9 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[7:8], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v8, v24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_lshr_b64 v[8:9], v[24:25], 16 ; SI-NEXT: v_mov_b32_e32 v9, v54 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_mov_b32_e32 v10, v11 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: v_mov_b32_e32 v10, v53 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 -; SI-NEXT: v_mov_b32_e32 v13, v58 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[53:54], 16 +; SI-NEXT: v_mov_b32_e32 v11, v52 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 -; SI-NEXT: v_mov_b32_e32 v14, v60 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[11:12], v[52:53], 16 +; SI-NEXT: v_mov_b32_e32 v12, v51 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 -; SI-NEXT: v_mov_b32_e32 v15, v62 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[12:13], v[51:52], 16 +; SI-NEXT: v_mov_b32_e32 v13, v39 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 -; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v14, v32 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_mov_b32_e32 v40, v17 -; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_mov_b32_e32 v21, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 -; SI-NEXT: v_mov_b32_e32 v22, v31 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 -; SI-NEXT: v_mov_b32_e32 v23, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 -; SI-NEXT: v_mov_b32_e32 v24, v41 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[14:15], v[32:33], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 -; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_lshr_b64 v[16:17], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v17, v62 +; SI-NEXT: v_mov_b32_e32 v19, v63 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v18, v44 +; SI-NEXT: v_mov_b32_e32 v63, v19 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[18:19], v[44:45], 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_mov_b32_e32 v19, v61 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[61:62], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v56 +; SI-NEXT: v_mov_b32_e32 v56, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; SI-NEXT: v_mov_b32_e32 v20, v60 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[60:61], 16 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v42 +; SI-NEXT: v_mov_b32_e32 v42, v59 +; SI-NEXT: v_lshr_b64 v[21:22], v[59:60], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v47 +; SI-NEXT: v_lshr_b64 v[22:23], v[58:59], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v58, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v43 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_mov_b32_e32 v23, v31 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v32, v35 ; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 -; SI-NEXT: v_mov_b32_e32 v26, v43 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_lshr_b64 v[26:27], v[34:35], 16 +; SI-NEXT: v_mov_b32_e32 v27, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 ; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v57 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshr_b64 v[28:29], v[50:51], 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_mov_b32_e32 v29, v30 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 -; SI-NEXT: v_mov_b32_e32 v53, v31 -; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[30:31], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v49 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshr_b64 v[34:35], v[49:50], 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[38:39], 16 +; SI-NEXT: v_mov_b32_e32 v31, v34 ; SI-NEXT: s_branch .LBB63_3 ; SI-NEXT: .LBB63_2: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: v_mov_b32_e32 v42, v59 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v50 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v60, v3 +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB63_3: ; %Flow -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mov_b32_e32 v32, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v33, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v53 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v47 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v54, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v36, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v61 ; SI-NEXT: s_cbranch_vccnz .LBB63_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 @@ -102568,83 +102062,72 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 ; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB63_5: ; %end @@ -102674,36 +102157,36 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 ; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 ; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 ; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill @@ -104053,8 +103536,8 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, v7 :: v_dual_mov_b32 v182, v6 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v183, v5 :: v_dual_mov_b32 v168, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v169, v3 :: v_dual_mov_b32 v170, v2 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, v0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, s28 :: v_dual_mov_b32 v173, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v173, v0 :: v_dual_mov_b32 v174, s29 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_4 @@ -104077,769 +103560,655 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s25, 16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v7, v2 :: v_dual_add_nc_u32 v7, v8, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v9, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v1.l -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v1.l +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 ; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 ; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 -; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v167 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v177 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v167 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v167 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v167 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v176 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v176 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v176 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v177 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v178 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v179 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v6 :: v_dual_add_f32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v177 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v177 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v178 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v178 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v181 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v4, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v179 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v179 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v180 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v180 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v183 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v3, v5, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v182 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v3, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v181 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v181 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v182 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v182 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v169 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v5, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v168 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v1.l ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v183 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v183 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v168 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v168 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v168 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v171 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v3, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v170 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v6 :: v_dual_add_f32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v168.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v169 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v169 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v170 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v170 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v171 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v171 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v172 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v170 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v169 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v172 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v6 :: v_dual_add_nc_u32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v5, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v171 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v173 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v173 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v173 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v4, v6 :: v_dual_add_nc_u32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v6 :: v_dual_lshlrev_b32 v6, 16, v173 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v174 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_add_f32 v5, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v174 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v9, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v172 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v172 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_add_f32 v4, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v173, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v173.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v174 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v174 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v7 :: v_dual_add_nc_u32 v7, v8, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v1.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v3.l ; GFX11-TRUE16-NEXT: .LBB63_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v125 :: v_dual_mov_b32 v5, v120 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v114 :: v_dual_mov_b32 v7, v107 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v99 :: v_dual_mov_b32 v9, v90 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v57 :: v_dual_mov_b32 v13, v44 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v17, v173 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v174 :: v_dual_mov_b32 v19, v171 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v19, v171 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v172 :: v_dual_mov_b32 v17, v174 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v23, v183 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 ; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 @@ -104948,101 +104317,174 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:184 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:56 +; GFX11-FAKE16-NEXT: s_clause 0xd ; 56-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v185, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v188, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v189, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v190, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v191, s32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v190, v13 :: v_dual_mov_b32 v191, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v67, v9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v179, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, v7 :: v_dual_mov_b32 v183, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v189, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v4 :: v_dual_mov_b32 v185, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v1 :: v_dual_mov_b32 v69, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v0 :: v_dual_mov_b32 v181, s29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB63_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v107, s16 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v34, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v140, s2 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:1080 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:1096 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:1112 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1128 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1144 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1160 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1176 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1192 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v144, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v114, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s17 :: v_dual_mov_b32 v159, s26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v42, s19 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:952 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:968 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:984 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1000 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1016 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1032 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1048 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1064 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:824 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:840 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:856 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:872 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:888 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:904 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:920 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:936 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s23 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:696 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:712 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:728 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:744 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:760 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:776 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:792 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:808 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, s24 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:632 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:648 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:664 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:680 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s25 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:552 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s27 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:424 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB63_3 ; GFX11-FAKE16-NEXT: .LBB63_2: ; %cmp.true @@ -105050,762 +104492,937 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s24, 16 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v9 :: v_dual_add_nc_u32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:424 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s23, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v159, v0, 16, v1 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s22, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:552 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s21, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:632 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:648 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:664 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:680 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s20, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:696 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:712 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:728 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:744 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:760 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:776 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:792 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:808 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s19, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:824 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:840 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:856 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:872 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:888 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:904 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:920 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:936 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s18, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:952 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:968 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:984 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1000 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1016 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1032 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1048 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1064 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:1080 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:1096 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:1112 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1128 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1144 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1160 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1176 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1192 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v69 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v42, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v114, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 ; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v107, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 ; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v144, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v140, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v4, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v190 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v190 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v1, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_add_nc_u32 v3, v5, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v191 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v7, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v191 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v190, v1, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v3, v5, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v68 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v191, v2, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v7, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v67 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v5, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v179 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v7, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v179 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v68, v1, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v9, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v8 :: v_dual_and_b32 v6, 0xffff0000, v70 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v2, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v188 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v188 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v6, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v7, v11, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_add_f32 v5, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v183 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v3, v3, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v70, v1, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v180 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v189 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v188, v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v7, 16, v180 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_add_nc_u32 v6, v7, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v189 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_add_nc_u32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v12, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v185 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_add_nc_u32 v5, v11, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v185 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v10 :: v_dual_cndmask_b32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v10, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v184 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v189, v5, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v182 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v6, v11 :: v_dual_lshlrev_b32 v10, 16, v69 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v181 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v13 :: v_dual_add_nc_u32 v11, v12, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v14, v12 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v16 :: v_dual_add_nc_u32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v13, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v185, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v3, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v5, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v14, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v7, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v69, v9, 16, v10 ; GFX11-FAKE16-NEXT: .LBB63_3: ; %end -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v34 :: v_dual_mov_b32 v2, v140 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v42 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:1080 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:1096 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:1112 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:1128 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:1144 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:1160 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:1176 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:1192 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v144 :: v_dual_mov_b32 v4, v107 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v76 :: v_dual_mov_b32 v6, v114 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v184 :: v_dual_mov_b32 v20, v185 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v180 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v183 :: v_dual_mov_b32 v24, v188 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, v32 :: v_dual_mov_b32 v30, v191 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v190 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, v181 :: v_dual_mov_b32 v18, v182 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v70 :: v_dual_mov_b32 v26, v179 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v67 :: v_dual_mov_b32 v28, v68 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v14, v159 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v69 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v41 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:952 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:968 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:984 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:1000 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:1016 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:1032 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:1048 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:1064 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v42 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:824 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:840 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:856 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:872 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:888 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:904 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:920 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:936 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v43 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:696 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:712 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:728 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:744 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:760 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:776 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:792 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:808 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, v44 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:632 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:648 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:664 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:680 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v45 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:552 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v46 +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 224-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v191, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v188, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v185, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:92 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:220 +; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:308 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, v48 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB63_4: -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 ; GFX11-FAKE16-NEXT: s_branch .LBB63_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -105846,16 +105463,15 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 @@ -105866,200 +105482,201 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -106071,15 +105688,15 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v62 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -106123,13 +105740,13 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: .LBB64_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 -; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v2, vcc +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; SI-NEXT: s_waitcnt expcnt(0) @@ -106153,94 +105770,94 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v63 -; SI-NEXT: v_addc_u32_e32 v46, vcc, 0, v62, vcc -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v46 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; SI-NEXT: v_addc_u32_e32 v42, vcc, 0, v62, vcc +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v31 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v58 -; SI-NEXT: v_mov_b32_e32 v58, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 -; SI-NEXT: v_mov_b32_e32 v34, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 -; SI-NEXT: v_mov_b32_e32 v32, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -106258,17 +105875,18 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 @@ -106276,19 +105894,21 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v6 -; SI-NEXT: v_mov_b32_e32 v50, v29 -; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_mov_b32_e32 v56, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 +; SI-NEXT: v_mov_b32_e32 v48, v29 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_mov_b32_e32 v36, v31 +; SI-NEXT: v_mov_b32_e32 v46, v28 +; SI-NEXT: v_mov_b32_e32 v63, v8 ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -106296,16 +105916,15 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: .LBB64_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 @@ -106316,21 +105935,21 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -106417,7 +106036,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -106434,8 +106053,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -106511,7 +106130,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -106523,7 +106142,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -106545,7 +106164,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -106554,7 +106173,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -106562,22 +106181,22 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -106590,8 +106209,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -106929,9 +106548,9 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: s_lshr_b32 s4, s42, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 ; SI-NEXT: s_lshr_b32 s4, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 ; SI-NEXT: s_lshr_b32 s4, s44, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 @@ -106962,7 +106581,7 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v42, s47 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s43 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s45 ; SI-NEXT: v_cvt_f32_f16_e32 v62, s41 ; SI-NEXT: s_waitcnt expcnt(0) @@ -107065,7 +106684,7 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v42, s47 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s43 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s42 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s44 ; SI-NEXT: v_cvt_f32_f16_e32 v62, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 @@ -107098,8 +106717,8 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v43, s60 ; SI-NEXT: v_cvt_f32_f16_e32 v45, s59 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s58 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s57 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s45 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s40 ; SI-NEXT: v_cvt_f32_f16_e32 v61, s29 ; SI-NEXT: .LBB65_3: ; %end @@ -107121,22 +106740,22 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v57, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v56, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v56, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 ; SI-NEXT: v_add_i32_e32 v47, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -107363,10 +106982,10 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 @@ -108113,218 +107732,210 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB66_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -108332,27 +107943,37 @@ define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload @@ -108705,22 +108326,23 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v28 ; SI-NEXT: v_mov_b32_e32 v53, v26 -; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v41, v6 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -108732,40 +108354,40 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 ; SI-NEXT: v_mov_b32_e32 v54, v14 ; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_mov_b32_e32 v41, v11 -; SI-NEXT: v_mov_b32_e32 v40, v10 -; SI-NEXT: v_mov_b32_e32 v44, v9 -; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v44, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 @@ -108776,25 +108398,25 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v42 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -108806,68 +108428,77 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB67_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_or_b32_e32 v19, v54, v19 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_mov_b32_e32 v34, v53 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_mov_b32_e32 v53, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: v_mov_b32_e32 v52, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v51, v22 ; SI-NEXT: v_mov_b32_e32 v51, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 @@ -108889,82 +108520,76 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v38, v27 ; SI-NEXT: v_mov_b32_e32 v38, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v28, v37, v28 ; SI-NEXT: v_mov_b32_e32 v37, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 ; SI-NEXT: v_or_b32_e32 v9, v14, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_or_b32_e32 v19, v54, v19 -; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 -; SI-NEXT: v_or_b32_e32 v2, v60, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_or_b32_e32 v12, v61, v12 ; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v13, v59, v13 ; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_or_b32_e32 v13, v57, v13 ; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_or_b32_e32 v14, v47, v14 ; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v45, v15 ; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v15, v43, v15 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v40, v17 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_or_b32_e32 v18, v55, v18 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_mov_b32_e32 v42, v55 +; SI-NEXT: v_or_b32_e32 v17, v55, v17 +; SI-NEXT: v_mov_b32_e32 v33, v40 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 ; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: v_or_b32_e32 v31, v63, v31 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB67_3 ; SI-NEXT: .LBB67_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v62, v61 ; SI-NEXT: v_mov_b32_e32 v60, v59 ; SI-NEXT: v_mov_b32_e32 v58, v57 ; SI-NEXT: v_mov_b32_e32 v56, v47 ; SI-NEXT: v_mov_b32_e32 v46, v45 ; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v42, v55 +; SI-NEXT: v_mov_b32_e32 v33, v40 +; SI-NEXT: v_mov_b32_e32 v36, v54 ; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v34, v53 +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: v_mov_b32_e32 v53, v21 +; SI-NEXT: v_mov_b32_e32 v52, v22 ; SI-NEXT: v_mov_b32_e32 v51, v23 ; SI-NEXT: v_mov_b32_e32 v50, v24 ; SI-NEXT: v_mov_b32_e32 v49, v25 @@ -108975,298 +108600,290 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB67_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v35, v40 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v40, v46 -; SI-NEXT: v_mov_b32_e32 v41, v56 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v43, v60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v63 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v43, v58 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB67_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v47 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_mov_b32_e32 v55, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v55 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v34 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v58 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v35 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -109279,7 +108896,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 @@ -109296,7 +108913,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 ; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -109304,7 +108921,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 @@ -109334,7 +108951,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v16i64_scalar: @@ -109565,252 +109182,214 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v160, v13 :: v_dual_mov_b32 v161, v12 +; GFX11-NEXT: v_dual_mov_b32 v162, v11 :: v_dual_mov_b32 v163, v10 +; GFX11-NEXT: v_dual_mov_b32 v164, v9 :: v_dual_mov_b32 v165, v8 +; GFX11-NEXT: v_dual_mov_b32 v166, v7 :: v_dual_mov_b32 v167, v6 +; GFX11-NEXT: v_dual_mov_b32 v176, v5 :: v_dual_mov_b32 v177, v4 +; GFX11-NEXT: v_dual_mov_b32 v178, v3 :: v_dual_mov_b32 v179, v2 +; GFX11-NEXT: v_dual_mov_b32 v180, v1 :: v_dual_mov_b32 v181, v0 +; GFX11-NEXT: v_dual_mov_b32 v182, s28 :: v_dual_mov_b32 v183, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:96 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 ; GFX11-NEXT: s_cbranch_scc0 .LBB67_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v27, s18 +; GFX11-NEXT: v_dual_mov_b32 v20, s17 :: v_dual_mov_b32 v35, s19 +; GFX11-NEXT: v_dual_mov_b32 v44, s20 :: v_dual_mov_b32 v65, s22 +; GFX11-NEXT: v_dual_mov_b32 v54, s21 :: v_dual_mov_b32 v77, s23 +; GFX11-NEXT: v_dual_mov_b32 v90, s24 :: v_dual_mov_b32 v119, s26 +; GFX11-NEXT: v_dual_mov_b32 v104, s25 :: v_dual_mov_b32 v135, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB67_3 ; GFX11-NEXT: .LBB67_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v135, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v119, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v104, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v90, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v77, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v65, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v54, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v44, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v35, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v160, 0x200, v160 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v161, 0x200, v161 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v162, 0x200, v162 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v163, 0x200, v163 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v164, 0x200, v164 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v165, 0x200, v165 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v166, 0x200, v166 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v167, 0x200, v167 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB67_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 +; GFX11-NEXT: v_mov_b32_e32 v13, v104 ; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:220 +; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14 +; GFX11-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 +; GFX11-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65 +; GFX11-NEXT: v_mov_b32_e32 v14, v119 +; GFX11-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v182 +; GFX11-NEXT: v_dual_mov_b32 v17, v183 :: v_dual_mov_b32 v18, v181 +; GFX11-NEXT: v_dual_mov_b32 v19, v180 :: v_dual_mov_b32 v20, v179 +; GFX11-NEXT: v_dual_mov_b32 v21, v178 :: v_dual_mov_b32 v22, v177 +; GFX11-NEXT: v_dual_mov_b32 v23, v176 :: v_dual_mov_b32 v24, v167 +; GFX11-NEXT: v_dual_mov_b32 v25, v166 :: v_dual_mov_b32 v26, v165 +; GFX11-NEXT: v_dual_mov_b32 v27, v164 :: v_dual_mov_b32 v28, v163 +; GFX11-NEXT: v_dual_mov_b32 v29, v162 :: v_dual_mov_b32 v30, v161 +; GFX11-NEXT: v_mov_b32_e32 v31, v160 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB67_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 ; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109 +; GFX11-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122 +; GFX11-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136 +; GFX11-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151 ; GFX11-NEXT: s_branch .LBB67_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -110572,43 +110151,46 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 ; SI-NEXT: s_lshr_b64 s[36:37], s[56:57], 16 ; SI-NEXT: .LBB69_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s36, 16 -; SI-NEXT: s_and_b32 s29, s56, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s56, 0xffff +; SI-NEXT: s_lshl_b32 s29, s36, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v1, s27 ; SI-NEXT: s_and_b32 s27, s57, 0xffff ; SI-NEXT: s_lshl_b32 s29, s69, 16 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s34, 16 -; SI-NEXT: s_and_b32 s29, s46, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s47, 0xffff -; SI-NEXT: s_lshl_b32 s29, s68, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s30, 16 -; SI-NEXT: s_and_b32 s29, s44, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s46, 0xffff +; SI-NEXT: s_lshl_b32 s29, s34, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v5, s27 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s47, 0xffff +; SI-NEXT: s_lshl_b32 s29, s68, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_and_b32 s27, s45, 0xffff -; SI-NEXT: s_lshl_b32 s29, s67, 16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s44, 0xffff +; SI-NEXT: s_lshl_b32 s29, s30, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s45, 0xffff +; SI-NEXT: s_lshl_b32 s29, s67, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s27 ; SI-NEXT: s_and_b32 s27, s42, 0xffff ; SI-NEXT: s_lshl_b32 s29, s94, 16 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -110844,12 +110426,12 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: s_branch .LBB69_2 ; ; VI-LABEL: bitcast_v16i64_to_v64i16_scalar: @@ -111169,10 +110751,10 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -111392,8 +110974,8 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v11, v11, v44 ; SI-NEXT: v_or_b32_e32 v12, v12, v43 ; SI-NEXT: v_or_b32_e32 v13, v13, v42 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v55 +; SI-NEXT: v_or_b32_e32 v14, v14, v55 +; SI-NEXT: v_or_b32_e32 v15, v15, v40 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -111414,8 +110996,8 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 @@ -111593,8 +111175,8 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v11, v44, v11 ; SI-NEXT: v_or_b32_e32 v12, v43, v12 ; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v55, v15 +; SI-NEXT: v_or_b32_e32 v14, v55, v14 +; SI-NEXT: v_or_b32_e32 v15, v40, v15 ; SI-NEXT: v_or_b32_e32 v19, v39, v19 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -111925,222 +111507,207 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v12 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v38, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v46, v10 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v55 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB71_2 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB71_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v7, v0, v48 +; SI-NEXT: v_mov_b32_e32 v60, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v8, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v39 +; SI-NEXT: v_or_b32_e32 v9, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v11, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v12, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v13, v0, v44 +; SI-NEXT: v_or_b32_e32 v10, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v13, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v43 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v16, v0, v42 +; SI-NEXT: v_or_b32_e32 v16, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v18, v0, v41 +; SI-NEXT: v_or_b32_e32 v18, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v19, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v20, v0, v37 +; SI-NEXT: v_or_b32_e32 v20, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v22, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: v_or_b32_e32 v24, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 ; SI-NEXT: v_or_b32_e32 v27, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v28, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v29, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_or_b32_e32 v8, v1, v56 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: v_or_b32_e32 v31, v0, v31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB71_3 -; SI-NEXT: .LBB71_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB71_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v58, v49 -; SI-NEXT: s_cbranch_vccnz .LBB71_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_mov_b32_e32 v54, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v48 +; SI-NEXT: v_mov_b32_e32 v61, v39 +; SI-NEXT: v_mov_b32_e32 v59, v37 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v53, v35 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v63, v56 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -112185,139 +111752,134 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -112326,7 +111888,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB71_5: ; %end +; SI-NEXT: .LBB71_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -112343,8 +111905,26 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB71_4: +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v48 +; SI-NEXT: v_mov_b32_e32 v61, v39 +; SI-NEXT: v_mov_b32_e32 v60, v0 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v59, v37 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v54, v4 +; SI-NEXT: v_mov_b32_e32 v53, v35 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v63, v56 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB71_2 ; ; VI-LABEL: bitcast_v64i16_to_v16i64_scalar: ; VI: ; %bb.0: @@ -112681,252 +112261,214 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v160, v13 :: v_dual_mov_b32 v161, v12 +; GFX11-NEXT: v_dual_mov_b32 v162, v11 :: v_dual_mov_b32 v163, v10 +; GFX11-NEXT: v_dual_mov_b32 v164, v9 :: v_dual_mov_b32 v165, v8 +; GFX11-NEXT: v_dual_mov_b32 v166, v7 :: v_dual_mov_b32 v167, v6 +; GFX11-NEXT: v_dual_mov_b32 v176, v5 :: v_dual_mov_b32 v177, v4 +; GFX11-NEXT: v_dual_mov_b32 v178, v3 :: v_dual_mov_b32 v179, v2 +; GFX11-NEXT: v_dual_mov_b32 v180, v1 :: v_dual_mov_b32 v181, v0 +; GFX11-NEXT: v_dual_mov_b32 v182, s28 :: v_dual_mov_b32 v183, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:96 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 ; GFX11-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v27, s18 +; GFX11-NEXT: v_dual_mov_b32 v20, s17 :: v_dual_mov_b32 v35, s19 +; GFX11-NEXT: v_dual_mov_b32 v44, s20 :: v_dual_mov_b32 v65, s22 +; GFX11-NEXT: v_dual_mov_b32 v54, s21 :: v_dual_mov_b32 v77, s23 +; GFX11-NEXT: v_dual_mov_b32 v90, s24 :: v_dual_mov_b32 v119, s26 +; GFX11-NEXT: v_dual_mov_b32 v104, s25 :: v_dual_mov_b32 v135, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB71_3 ; GFX11-NEXT: .LBB71_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v135, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v119, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v104, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v90, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v77, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v65, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v54, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v44, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v35, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v160, v160, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v161, v161, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v162, v162, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v163, v163, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v164, v164, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v165, v165, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v166, v166, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v167, v167, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB71_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 +; GFX11-NEXT: v_mov_b32_e32 v13, v104 ; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:220 +; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14 +; GFX11-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 +; GFX11-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65 +; GFX11-NEXT: v_mov_b32_e32 v14, v119 +; GFX11-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v182 +; GFX11-NEXT: v_dual_mov_b32 v17, v183 :: v_dual_mov_b32 v18, v181 +; GFX11-NEXT: v_dual_mov_b32 v19, v180 :: v_dual_mov_b32 v20, v179 +; GFX11-NEXT: v_dual_mov_b32 v21, v178 :: v_dual_mov_b32 v22, v177 +; GFX11-NEXT: v_dual_mov_b32 v23, v176 :: v_dual_mov_b32 v24, v167 +; GFX11-NEXT: v_dual_mov_b32 v25, v166 :: v_dual_mov_b32 v26, v165 +; GFX11-NEXT: v_dual_mov_b32 v27, v164 :: v_dual_mov_b32 v28, v163 +; GFX11-NEXT: v_dual_mov_b32 v29, v162 :: v_dual_mov_b32 v30, v161 +; GFX11-NEXT: v_mov_b32_e32 v31, v160 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB71_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 ; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109 +; GFX11-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122 +; GFX11-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136 +; GFX11-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151 ; GFX11-NEXT: s_branch .LBB71_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -113097,19 +112639,19 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 @@ -113117,15 +112659,15 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(14) @@ -113205,28 +112747,28 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill @@ -113241,7 +112783,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill @@ -113250,7 +112792,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill @@ -113268,7 +112810,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill @@ -113277,22 +112819,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill @@ -113301,7 +112843,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill @@ -113313,7 +112855,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill @@ -113322,32 +112864,32 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 -; SI-NEXT: v_alignbit_b32 v39, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v49, v6, v5, 8 -; SI-NEXT: v_alignbit_b32 v52, v4, v3, 24 -; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v40, v4, v3, 8 -; SI-NEXT: v_alignbit_b32 v42, v2, v1, 24 -; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v46, v2, v1, 8 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v52, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 -; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 ; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 @@ -113444,31 +112986,31 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill @@ -113484,7 +113026,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill @@ -113493,7 +113035,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill @@ -113511,7 +113053,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill @@ -113520,22 +113062,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill @@ -113544,7 +113086,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill @@ -113556,7 +113098,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill @@ -113568,32 +113110,32 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; SI-NEXT: v_alignbit_b32 v37, v6, v5, 24 -; SI-NEXT: v_alignbit_b32 v39, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v49, v6, v5, 8 -; SI-NEXT: v_alignbit_b32 v52, v4, v3, 24 -; SI-NEXT: v_alignbit_b32 v54, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v40, v4, v3, 8 -; SI-NEXT: v_alignbit_b32 v42, v2, v1, 24 -; SI-NEXT: v_alignbit_b32 v44, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v46, v2, v1, 8 +; SI-NEXT: v_alignbit_b32 v38, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v51, v4, v3, 24 +; SI-NEXT: v_alignbit_b32 v52, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v55, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v41, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v45, v2, v1, 8 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 -; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v10 ; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v8 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v8 @@ -113609,15 +113151,15 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v2 ; SI-NEXT: .LBB72_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v46 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v42 -; SI-NEXT: v_or_b32_e32 v42, v42, v44 +; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v45 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v45 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v41 +; SI-NEXT: v_or_b32_e32 v41, v41, v43 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v41 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 @@ -113632,12 +113174,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -113656,10 +113198,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -113679,13 +113221,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -113709,13 +113251,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -113729,23 +113271,23 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v46 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -113759,24 +113301,24 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v53 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -113791,22 +113333,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v39 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -113820,7 +113362,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -113835,13 +113377,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -113856,7 +113398,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -113925,7 +113467,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload @@ -113961,14 +113503,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -113997,10 +113539,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -114036,7 +113578,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -114146,10 +113688,68 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -114161,6 +113761,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -114171,6 +113774,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -114181,6 +113787,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -114191,6 +113800,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -114201,6 +113813,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -114211,6 +113826,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -114221,6 +113839,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -114231,6 +113852,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 @@ -114241,111 +113865,26 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; kill: killed $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -114360,162 +113899,169 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v52, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v41, v38 +; VI-NEXT: v_lshrrev_b64 v[38:39], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 +; VI-NEXT: v_mov_b32_e32 v39, v55 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v1 ; VI-NEXT: .LBB72_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB72_4 @@ -114553,37 +114099,37 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; VI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; VI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[38:39], 24, v[1:2] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v11 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -114598,210 +114144,214 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v52, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v1 ; VI-NEXT: .LBB72_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v54 ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v55 -; VI-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v41 ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v54, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v38 +; VI-NEXT: v_or_b32_sdwa v55, v60, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v53 +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v53 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v44 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v52 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v51 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v60 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v34 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 -; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -114812,23 +114362,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -114839,23 +114388,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -114866,23 +114414,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -114893,23 +114440,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -114920,10 +114466,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -114933,9 +114479,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -114946,10 +114492,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -114960,9 +114506,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -114973,10 +114519,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -114987,9 +114533,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -115000,10 +114546,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -115014,11 +114560,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115066,16 +114614,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v42 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -115119,402 +114663,393 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; kill: killed $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; kill: killed $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; kill: killed $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; kill: killed $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(47) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; kill: killed $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB72_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(49) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v1 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v42, v55 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v1 ; GFX9-NEXT: .LBB72_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB72_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(46) +; GFX9-NEXT: s_waitcnt vmcnt(42) ; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 @@ -115564,12 +115099,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v16 +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v15 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -115579,347 +115114,330 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v1 ; GFX9-NEXT: .LBB72_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v40 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v44 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38 -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v63 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v55 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v53 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v36 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v57 -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v48 -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v46 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v41 -; GFX9-NEXT: v_or_b32_sdwa v34, v54, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v54, v54, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115929,11 +115447,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -115942,10 +115460,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115955,11 +115473,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -115968,10 +115486,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115981,11 +115499,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -115994,10 +115512,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -116007,11 +115525,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -116020,10 +115538,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -116033,11 +115551,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -116046,10 +115564,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -116059,11 +115577,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -116072,15 +115590,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v42 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -116343,17 +115858,17 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l @@ -116547,28 +116062,27 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:12 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 @@ -116654,6 +116168,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 @@ -116662,7 +116177,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 @@ -116671,18 +116185,19 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 @@ -116764,10 +116279,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: .LBB72_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB72_4 @@ -116884,56 +116398,52 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-FAKE16-NEXT: .LBB72_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v63 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v66 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, v67, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v61 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v62 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v60 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v67, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v58 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v65 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v57 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v65 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v66, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v47 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v46 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v45 @@ -116942,22 +116452,26 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v64 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v65, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v44 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v43 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v64 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v54 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v42 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v41 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v40 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v65 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v182 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 @@ -117187,27 +116701,26 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:84 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -117232,9 +116745,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_writelane_b32 v63, s30, 0 @@ -117437,55 +116950,55 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 ; SI-NEXT: v_writelane_b32 v62, s26, 32 ; SI-NEXT: v_writelane_b32 v62, s27, 33 -; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 ; SI-NEXT: v_writelane_b32 v62, s26, 30 ; SI-NEXT: v_writelane_b32 v62, s27, 31 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 8 ; SI-NEXT: v_writelane_b32 v62, s26, 28 ; SI-NEXT: v_writelane_b32 v62, s27, 29 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 ; SI-NEXT: v_writelane_b32 v62, s26, 26 ; SI-NEXT: v_writelane_b32 v62, s27, 27 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 ; SI-NEXT: v_writelane_b32 v62, s26, 24 ; SI-NEXT: v_writelane_b32 v62, s27, 25 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 ; SI-NEXT: v_writelane_b32 v62, s26, 22 ; SI-NEXT: v_writelane_b32 v62, s27, 23 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 24 ; SI-NEXT: v_writelane_b32 v62, s26, 20 ; SI-NEXT: v_writelane_b32 v62, s27, 21 -; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 16 ; SI-NEXT: v_writelane_b32 v62, s26, 18 ; SI-NEXT: v_writelane_b32 v62, s27, 19 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[8:9], 8 ; SI-NEXT: v_writelane_b32 v62, s26, 16 ; SI-NEXT: v_writelane_b32 v62, s27, 17 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 24 ; SI-NEXT: v_writelane_b32 v62, s26, 14 ; SI-NEXT: v_writelane_b32 v62, s27, 15 -; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 16 ; SI-NEXT: v_writelane_b32 v62, s26, 12 ; SI-NEXT: v_writelane_b32 v62, s27, 13 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 ; SI-NEXT: v_writelane_b32 v62, s26, 10 ; SI-NEXT: v_writelane_b32 v62, s27, 11 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 24 ; SI-NEXT: v_writelane_b32 v62, s26, 8 ; SI-NEXT: v_writelane_b32 v62, s27, 9 -; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 16 ; SI-NEXT: v_writelane_b32 v62, s26, 6 ; SI-NEXT: v_writelane_b32 v62, s27, 7 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[12:13], 8 ; SI-NEXT: v_writelane_b32 v62, s26, 4 ; SI-NEXT: v_writelane_b32 v62, s27, 5 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 24 ; SI-NEXT: v_writelane_b32 v62, s26, 2 ; SI-NEXT: v_writelane_b32 v62, s27, 3 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 ; SI-NEXT: v_writelane_b32 v62, s26, 0 -; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 16 ; SI-NEXT: v_writelane_b32 v62, s27, 1 +; SI-NEXT: s_lshr_b64 s[48:49], s[14:15], 8 ; SI-NEXT: s_lshr_b64 s[50:51], s[16:17], 24 ; SI-NEXT: s_lshr_b64 s[52:53], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 8 @@ -117518,176 +117031,162 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_lshr_b64 s[38:39], s[56:57], 8 ; SI-NEXT: s_cbranch_execnz .LBB73_4 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[30:31], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 -; SI-NEXT: v_lshr_b64 v[9:10], v[30:31], 24 -; SI-NEXT: v_lshr_b64 v[10:11], v[7:8], 24 -; SI-NEXT: v_add_f64 v[54:55], s[40:41], 1.0 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[54:55], 24 -; SI-NEXT: v_add_f64 v[42:43], s[42:43], 1.0 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[54:55], 16 -; SI-NEXT: v_add_f64 v[37:38], s[4:5], 1.0 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[42:43], 16 -; SI-NEXT: v_add_f64 v[56:57], s[44:45], 1.0 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 +; SI-NEXT: v_lshr_b64 v[19:20], v[1:2], 24 +; SI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[42:43], 8 -; SI-NEXT: v_lshr_b64 v[1:2], v[37:38], 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[19:20], v[1:2], 16 +; SI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[56:57], 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[1:2], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[1:2], 8 +; SI-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[56:57], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[1:2], v[37:38], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[3:4], 24 +; SI-NEXT: v_add_f64 v[11:12], s[14:15], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[56:57], 8 -; SI-NEXT: v_add_f64 v[35:36], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[19:20], s[46:47], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[1:2], v[35:36], 24 +; SI-NEXT: v_lshr_b64 v[19:20], v[3:4], 16 +; SI-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[19:20], 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[1:2], v[35:36], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[3:4], 8 +; SI-NEXT: v_add_f64 v[15:16], s[18:19], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[19:20], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[1:2], v[35:36], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[5:6], 24 +; SI-NEXT: v_add_f64 v[27:28], s[22:23], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[19:20], 8 -; SI-NEXT: v_add_f64 v[3:4], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[24:25], s[56:57], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 24 +; SI-NEXT: v_lshr_b64 v[19:20], v[5:6], 16 +; SI-NEXT: v_add_f64 v[33:34], s[24:25], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[24:25], 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[5:6], 8 +; SI-NEXT: v_lshr_b64 v[35:36], v[27:28], 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[24:25], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[1:2], v[3:4], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[7:8], 24 +; SI-NEXT: v_add_f64 v[41:42], s[42:43], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[24:25], 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[19:20], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[33:34], 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_f64 v[47:48], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 -; SI-NEXT: v_lshr_b64 v[1:2], v[47:48], 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_add_f64 v[32:33], s[14:15], 1.0 -; SI-NEXT: v_add_f64 v[28:29], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[19:20], v[7:8], 8 +; SI-NEXT: v_lshr_b64 v[37:38], v[33:34], 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[1:2], v[47:48], 8 -; SI-NEXT: v_lshr_b64 v[17:18], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[9:10], 24 ; SI-NEXT: v_add_f64 v[21:22], s[20:21], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[15:16], v[32:33], 8 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[26:27], v[28:29], 8 -; SI-NEXT: v_readfirstlane_b32 s11, v48 -; SI-NEXT: v_lshr_b64 v[51:52], v[47:48], 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshr_b64 v[1:2], v[28:29], 24 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshr_b64 v[16:17], v[28:29], 16 -; SI-NEXT: v_mov_b32_e32 v48, v28 -; SI-NEXT: v_lshr_b64 v[27:28], v[21:22], 8 -; SI-NEXT: v_readfirstlane_b32 s19, v29 -; SI-NEXT: v_lshr_b64 v[28:29], v[30:31], 16 -; SI-NEXT: v_readfirstlane_b32 s23, v31 -; SI-NEXT: v_lshr_b64 v[52:53], v[30:31], 8 -; SI-NEXT: v_readfirstlane_b32 s47, v20 -; SI-NEXT: v_readfirstlane_b32 s43, v43 -; SI-NEXT: v_readfirstlane_b32 s5, v38 -; SI-NEXT: v_lshr_b64 v[38:39], v[42:43], 24 -; SI-NEXT: v_add_f64 v[44:45], s[12:13], 1.0 -; SI-NEXT: v_readfirstlane_b32 s9, v4 -; SI-NEXT: v_lshr_b64 v[58:59], v[44:45], 24 -; SI-NEXT: v_lshr_b64 v[4:5], v[32:33], 16 -; SI-NEXT: v_readfirstlane_b32 s15, v33 -; SI-NEXT: v_readfirstlane_b32 s13, v45 -; SI-NEXT: v_readfirstlane_b32 s7, v36 -; SI-NEXT: v_lshr_b64 v[40:41], v[44:45], 16 -; SI-NEXT: v_lshr_b64 v[59:60], v[44:45], 8 -; SI-NEXT: v_lshr_b64 v[45:46], v[32:33], 24 -; SI-NEXT: v_lshr_b64 v[33:34], v[13:14], 24 -; SI-NEXT: v_lshr_b64 v[5:6], v[13:14], 8 -; SI-NEXT: v_mov_b32_e32 v36, v13 -; SI-NEXT: v_lshr_b64 v[12:13], v[21:22], 24 -; SI-NEXT: v_readfirstlane_b32 s17, v14 -; SI-NEXT: v_lshr_b64 v[13:14], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[54:55], 8 -; SI-NEXT: v_mov_b32_e32 v14, v9 -; SI-NEXT: v_readfirstlane_b32 s57, v25 -; SI-NEXT: v_mov_b32_e32 v25, v12 -; SI-NEXT: v_mov_b32_e32 v12, v5 -; SI-NEXT: v_mov_b32_e32 v5, v16 -; SI-NEXT: v_readfirstlane_b32 s25, v8 -; SI-NEXT: v_readfirstlane_b32 s21, v22 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[9:10], 16 +; SI-NEXT: v_add_f64 v[56:57], s[44:45], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[9:10], 8 +; SI-NEXT: v_lshr_b64 v[38:39], v[41:42], 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[11:12], 24 +; SI-NEXT: v_readfirstlane_b32 s43, v42 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[39:40], v[41:42], 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[11:12], 8 +; SI-NEXT: v_lshr_b64 v[42:43], v[41:42], 8 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[13:14], 24 +; SI-NEXT: v_add_f64 v[50:51], s[40:41], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[13:14], 16 +; SI-NEXT: v_add_f64 v[17:18], s[46:47], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[17:18], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[7:8], 8 -; SI-NEXT: v_mov_b32_e32 v23, v27 -; SI-NEXT: v_mov_b32_e32 v27, v17 -; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_lshr_b64 v[19:20], v[13:14], 8 +; SI-NEXT: v_lshr_b64 v[43:44], v[56:57], 24 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[15:16], 24 +; SI-NEXT: v_add_f64 v[29:30], s[56:57], 1.0 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[56:57], 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[15:16], 8 ; SI-NEXT: v_readfirstlane_b32 s45, v57 -; SI-NEXT: v_readfirstlane_b32 s41, v55 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[21:22], 24 +; SI-NEXT: v_lshr_b64 v[23:24], v[21:22], 16 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[52:53], v[50:51], 24 +; SI-NEXT: v_lshr_b64 v[45:46], v[56:57], 8 +; SI-NEXT: v_lshr_b64 v[57:58], v[17:18], 16 +; SI-NEXT: v_readfirstlane_b32 s47, v18 +; SI-NEXT: v_lshr_b64 v[24:25], v[21:22], 8 +; SI-NEXT: v_lshr_b64 v[53:54], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[46:47], v[17:18], 24 +; SI-NEXT: v_lshr_b64 v[58:59], v[17:18], 8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[18:19], v[29:30], 16 +; SI-NEXT: v_readfirstlane_b32 s57, v30 +; SI-NEXT: v_readfirstlane_b32 s41, v51 +; SI-NEXT: v_readfirstlane_b32 s25, v34 +; SI-NEXT: v_readfirstlane_b32 s23, v28 +; SI-NEXT: v_readfirstlane_b32 s21, v22 +; SI-NEXT: v_readfirstlane_b32 s19, v16 +; SI-NEXT: v_readfirstlane_b32 s17, v14 +; SI-NEXT: v_readfirstlane_b32 s15, v12 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_lshr_b64 v[25:26], v[27:28], 24 +; SI-NEXT: v_lshr_b64 v[31:32], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[33:34], 8 +; SI-NEXT: v_lshr_b64 v[54:55], v[50:51], 8 +; SI-NEXT: v_lshr_b64 v[59:60], v[29:30], 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[29:30], 8 ; SI-NEXT: s_lshr_b32 s10, s5, 24 ; SI-NEXT: s_lshr_b32 s12, s5, 16 ; SI-NEXT: s_lshr_b32 s14, s5, 8 @@ -117707,10 +117206,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_lshr_b32 s56, s15, 16 ; SI-NEXT: s_lshr_b32 s58, s15, 8 ; SI-NEXT: s_lshr_b32 s59, s17, 24 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mov_b32_e32 v29, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_lshr_b32 s60, s17, 16 ; SI-NEXT: s_lshr_b32 s61, s17, 8 ; SI-NEXT: s_lshr_b32 s62, s19, 24 @@ -117740,169 +117235,151 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_lshr_b32 s8, s57, 24 ; SI-NEXT: s_lshr_b32 vcc_lo, s57, 16 ; SI-NEXT: s_lshr_b32 s6, s57, 8 -; SI-NEXT: v_mov_b32_e32 v34, v4 -; SI-NEXT: v_mov_b32_e32 v53, v51 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v43, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v46, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v15, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v51, v10 ; SI-NEXT: s_branch .LBB73_5 ; SI-NEXT: .LBB73_3: +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 0 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 1 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 2 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 3 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 4 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 5 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 6 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 7 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 8 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 9 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 10 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 11 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 12 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 13 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 14 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 15 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 16 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 17 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 18 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 19 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 0 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 1 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 2 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 3 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 4 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 5 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 6 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 7 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 8 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 9 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 10 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 11 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 12 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 13 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 14 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 15 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 16 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 17 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 18 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 19 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 20 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 21 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 20 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 22 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 21 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 23 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 22 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 24 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 23 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 25 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 24 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 25 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 27 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 28 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 27 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 29 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 28 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 30 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 29 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 31 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 30 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 32 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 31 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 32 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 33 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 33 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr34 @@ -117918,7 +117395,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr86 @@ -117933,124 +117412,73 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; kill: killed $sgpr27 ; SI-NEXT: s_branch .LBB73_2 ; SI-NEXT: .LBB73_4: -; SI-NEXT: v_mov_b32_e32 v1, s38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v2, s70 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s36 -; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v2, s68 +; SI-NEXT: v_mov_b32_e32 v37, s26 ; SI-NEXT: v_readlane_b32 s26, v62, 0 ; SI-NEXT: v_readlane_b32 s27, v62, 1 -; SI-NEXT: v_mov_b32_e32 v17, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 2 ; SI-NEXT: v_readlane_b32 s27, v62, 3 -; SI-NEXT: v_mov_b32_e32 v34, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 4 ; SI-NEXT: v_readlane_b32 s27, v62, 5 -; SI-NEXT: v_mov_b32_e32 v45, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 6 ; SI-NEXT: v_readlane_b32 s27, v62, 7 -; SI-NEXT: v_mov_b32_e32 v59, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 8 ; SI-NEXT: v_readlane_b32 s27, v62, 9 -; SI-NEXT: v_mov_b32_e32 v40, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 10 ; SI-NEXT: v_readlane_b32 s27, v62, 11 -; SI-NEXT: v_mov_b32_e32 v58, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 12 ; SI-NEXT: v_readlane_b32 s27, v62, 13 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 14 ; SI-NEXT: v_readlane_b32 s27, v62, 15 -; SI-NEXT: v_mov_b32_e32 v53, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 16 ; SI-NEXT: v_readlane_b32 s27, v62, 17 -; SI-NEXT: v_mov_b32_e32 v50, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 18 ; SI-NEXT: v_readlane_b32 s27, v62, 19 -; SI-NEXT: v_mov_b32_e32 v29, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 20 ; SI-NEXT: v_readlane_b32 s27, v62, 21 -; SI-NEXT: v_mov_b32_e32 v31, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 22 ; SI-NEXT: v_readlane_b32 s27, v62, 23 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 24 ; SI-NEXT: v_readlane_b32 s27, v62, 25 -; SI-NEXT: v_mov_b32_e32 v43, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 26 ; SI-NEXT: v_readlane_b32 s27, v62, 27 -; SI-NEXT: v_mov_b32_e32 v46, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 28 ; SI-NEXT: v_readlane_b32 s27, v62, 29 -; SI-NEXT: v_mov_b32_e32 v16, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 30 ; SI-NEXT: v_readlane_b32 s27, v62, 31 -; SI-NEXT: v_mov_b32_e32 v3, s8 -; SI-NEXT: v_mov_b32_e32 v51, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 32 ; SI-NEXT: v_readlane_b32 s27, v62, 33 -; SI-NEXT: v_mov_b32_e32 v38, s72 -; SI-NEXT: v_mov_b32_e32 v49, s62 -; SI-NEXT: v_mov_b32_e32 v22, s28 -; SI-NEXT: v_mov_b32_e32 v24, s56 -; SI-NEXT: v_mov_b32_e32 v19, s46 +; SI-NEXT: v_mov_b32_e32 v29, s56 +; SI-NEXT: v_mov_b32_e32 v17, s46 ; SI-NEXT: v_mov_b32_e32 v56, s44 -; SI-NEXT: v_mov_b32_e32 v42, s42 -; SI-NEXT: v_mov_b32_e32 v54, s40 -; SI-NEXT: v_mov_b32_e32 v7, s24 -; SI-NEXT: v_mov_b32_e32 v30, s22 +; SI-NEXT: v_mov_b32_e32 v41, s42 +; SI-NEXT: v_mov_b32_e32 v50, s40 +; SI-NEXT: v_mov_b32_e32 v33, s24 +; SI-NEXT: v_mov_b32_e32 v27, s22 ; SI-NEXT: v_mov_b32_e32 v21, s20 -; SI-NEXT: v_mov_b32_e32 v48, s18 -; SI-NEXT: v_mov_b32_e32 v36, s16 -; SI-NEXT: v_mov_b32_e32 v32, s14 -; SI-NEXT: v_mov_b32_e32 v44, s12 -; SI-NEXT: v_mov_b32_e32 v47, s10 -; SI-NEXT: v_mov_b32_e32 v35, s6 -; SI-NEXT: v_mov_b32_e32 v37, s4 -; SI-NEXT: v_mov_b32_e32 v52, s96 -; SI-NEXT: v_mov_b32_e32 v28, s86 -; SI-NEXT: v_mov_b32_e32 v14, s84 -; SI-NEXT: v_mov_b32_e32 v23, s82 -; SI-NEXT: v_mov_b32_e32 v27, s80 -; SI-NEXT: v_mov_b32_e32 v25, s70 -; SI-NEXT: v_mov_b32_e32 v26, s68 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s34 -; SI-NEXT: v_mov_b32_e32 v5, s66 -; SI-NEXT: v_mov_b32_e32 v12, s54 -; SI-NEXT: v_mov_b32_e32 v33, s50 -; SI-NEXT: v_mov_b32_e32 v8, s48 -; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_mov_b32_e32 v11, s14 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v19, s38 +; SI-NEXT: v_mov_b32_e32 v18, s36 +; SI-NEXT: v_mov_b32_e32 v59, s34 +; SI-NEXT: v_mov_b32_e32 v58, s30 +; SI-NEXT: v_mov_b32_e32 v57, s94 +; SI-NEXT: v_mov_b32_e32 v46, s92 +; SI-NEXT: v_mov_b32_e32 v45, s90 +; SI-NEXT: v_mov_b32_e32 v44, s88 +; SI-NEXT: v_mov_b32_e32 v43, s78 +; SI-NEXT: v_mov_b32_e32 v42, s76 +; SI-NEXT: v_mov_b32_e32 v39, s74 +; SI-NEXT: v_mov_b32_e32 v38, s72 +; SI-NEXT: v_mov_b32_e32 v54, s62 +; SI-NEXT: v_mov_b32_e32 v53, s60 +; SI-NEXT: v_mov_b32_e32 v52, s58 +; SI-NEXT: v_mov_b32_e32 v48, s28 +; SI-NEXT: v_mov_b32_e32 v36, s98 +; SI-NEXT: v_mov_b32_e32 v35, s96 +; SI-NEXT: v_mov_b32_e32 v31, s86 +; SI-NEXT: v_mov_b32_e32 v25, s84 +; SI-NEXT: v_mov_b32_e32 v24, s82 +; SI-NEXT: v_mov_b32_e32 v23, s80 ; SI-NEXT: v_readlane_b32 s10, v62, 34 ; SI-NEXT: v_readlane_b32 s12, v62, 35 ; SI-NEXT: v_readlane_b32 s14, v62, 36 @@ -118059,7 +117487,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_readlane_b32 s20, v62, 39 ; SI-NEXT: v_readlane_b32 s22, v62, 40 ; SI-NEXT: v_readlane_b32 s24, v62, 41 -; SI-NEXT: v_readlane_b32 s26, v62, 42 ; SI-NEXT: v_readlane_b32 s27, v62, 43 ; SI-NEXT: v_readlane_b32 s28, v62, 44 ; SI-NEXT: v_readlane_b32 s29, v62, 45 @@ -118068,19 +117495,33 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_readlane_b32 s44, v62, 48 ; SI-NEXT: v_readlane_b32 s46, v62, 49 ; SI-NEXT: v_readlane_b32 s56, v62, 50 +; SI-NEXT: v_readlane_b32 s58, v62, 51 ; SI-NEXT: v_readlane_b32 s59, v62, 52 +; SI-NEXT: v_readlane_b32 s60, v62, 53 ; SI-NEXT: v_readlane_b32 s61, v62, 54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s66 ; SI-NEXT: v_readlane_b32 s62, v62, 55 ; SI-NEXT: v_readlane_b32 s63, v62, 56 ; SI-NEXT: v_readlane_b32 s72, v62, 57 ; SI-NEXT: v_readlane_b32 s73, v62, 58 +; SI-NEXT: v_readlane_b32 s74, v62, 59 ; SI-NEXT: v_readlane_b32 s75, v62, 60 +; SI-NEXT: v_readlane_b32 s76, v62, 61 ; SI-NEXT: v_readlane_b32 s77, v62, 62 +; SI-NEXT: v_readlane_b32 s78, v62, 63 ; SI-NEXT: v_readlane_b32 s79, v61, 0 +; SI-NEXT: v_readlane_b32 s88, v61, 1 ; SI-NEXT: v_readlane_b32 s89, v61, 2 +; SI-NEXT: v_readlane_b32 s90, v61, 3 ; SI-NEXT: v_readlane_b32 s91, v61, 4 +; SI-NEXT: v_readlane_b32 s92, v61, 5 ; SI-NEXT: v_readlane_b32 s93, v61, 6 +; SI-NEXT: v_readlane_b32 s94, v61, 7 ; SI-NEXT: v_readlane_b32 s95, v61, 8 +; SI-NEXT: v_readlane_b32 s30, v61, 9 ; SI-NEXT: v_readlane_b32 s31, v61, 10 ; SI-NEXT: v_readlane_b32 s34, v61, 11 ; SI-NEXT: v_readlane_b32 s35, v61, 12 @@ -118089,307 +117530,278 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_readlane_b32 s8, v61, 15 ; SI-NEXT: v_readlane_b32 vcc_lo, v61, 16 ; SI-NEXT: v_readlane_b32 s6, v61, 17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s30 -; SI-NEXT: v_readlane_b32 s30, v61, 9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, s64 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s94 -; SI-NEXT: v_readlane_b32 s94, v61, 7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, s54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s92 -; SI-NEXT: v_readlane_b32 s92, v61, 5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, s52 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s90 -; SI-NEXT: v_readlane_b32 s90, v61, 3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, s50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s88 -; SI-NEXT: v_readlane_b32 s88, v61, 1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, s48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s78 -; SI-NEXT: v_readlane_b32 s78, v62, 63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s76 -; SI-NEXT: v_readlane_b32 s76, v62, 61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s74 -; SI-NEXT: v_readlane_b32 s74, v62, 59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s60 -; SI-NEXT: v_readlane_b32 s60, v62, 53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s58 -; SI-NEXT: v_readlane_b32 s58, v62, 51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v1, s98 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s52 -; SI-NEXT: v_mov_b32_e32 v1, s64 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 42 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: .LBB73_5: ; %end -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v19 ; SI-NEXT: s_and_b32 s4, s57, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, vcc_lo, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v59 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_lshl_b32 s6, s37, 8 -; SI-NEXT: s_lshl_b32 s8, s35, 24 -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v10 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v19 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v58 ; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s6, s37, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v57 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s36, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v46 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s8, s6 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_lshl_b32 s6, s34, 8 -; SI-NEXT: s_lshl_b32 s8, s30, 24 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: s_lshl_b32 s8, s35, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v18 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v45 ; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_lshl_b32 s6, s34, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v44 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s31, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v43 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s8, s6 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_lshl_b32 s6, s95, 8 -; SI-NEXT: s_lshl_b32 s8, s93, 24 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: s_lshl_b32 s8, s30, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v55 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v42 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v41 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v42 ; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s6, s95, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v39 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s94, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v38 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s6, s8, s6 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_lshl_b32 s6, s92, 8 -; SI-NEXT: s_lshl_b32 s8, s90, 24 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: s_lshl_b32 s8, s93, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v50 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v49 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v54 ; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s6, s92, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v53 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s91, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v52 ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s90, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_lshl_b32 s6, s89, 8 -; SI-NEXT: s_lshl_b32 s8, s79, 24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v38 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v48 ; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s6, s89, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v37 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s88, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s8, s79, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 -; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_lshl_b32 s6, s78, 8 -; SI-NEXT: s_lshl_b32 s8, s76, 24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v35 ; SI-NEXT: s_and_b32 s4, s23, 0xff +; SI-NEXT: s_lshl_b32 s6, s78, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v31 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s77, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v14 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v25 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: s_lshl_b32 s8, s76, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -118400,80 +117812,131 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v24 ; SI-NEXT: s_and_b32 s4, s21, 0xff ; SI-NEXT: s_lshl_b32 s6, s75, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v23 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s74, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v25 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s8, s73, 24 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s72, 8 +; SI-NEXT: s_lshl_b32 s8, s62, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v16 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v48 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s6, s72, 8 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v5 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s63, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s62, 24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s61, 8 +; SI-NEXT: s_lshl_b32 s8, s59, 24 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v14 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s6, s61, 8 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s60, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s59, 24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_lshl_b32 s6, s58, 8 ; SI-NEXT: s_lshl_b32 s8, s46, 24 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v12 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 @@ -118482,147 +117945,199 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v34 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s56, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v45 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s44, 8 +; SI-NEXT: s_lshl_b32 s8, s40, 24 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v10 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v59 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s6, s44, 8 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v40 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s42, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v58 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s40, 24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s6, s29, 8 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v53 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s28, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v50 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s26, 8 +; SI-NEXT: s_lshl_b32 s8, s22, 24 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v29 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s6, s26, 8 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v31 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s24, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s22, 24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s20, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s6, s20, 8 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v46 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v16 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s16, 24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_lshl_b32 s6, s10, 24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s5, 0xff ; SI-NEXT: s_lshl_b32 s5, s14, 8 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s12, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s10, 24 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt expcnt(0) @@ -118642,9 +118157,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -118653,8 +118168,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -118712,15 +118227,15 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readfirstlane_b32 s25, v20 ; VI-NEXT: v_mov_b32_e32 v20, s28 ; VI-NEXT: v_writelane_b32 v63, s86, 30 -; VI-NEXT: v_readfirstlane_b32 s20, v20 +; VI-NEXT: v_readfirstlane_b32 s22, v20 ; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; VI-NEXT: v_writelane_b32 v63, s87, 31 -; VI-NEXT: v_readfirstlane_b32 s21, v20 -; VI-NEXT: v_readfirstlane_b32 s22, v1 -; VI-NEXT: v_readfirstlane_b32 s23, v2 -; VI-NEXT: v_readfirstlane_b32 s18, v3 -; VI-NEXT: v_readfirstlane_b32 s19, v4 +; VI-NEXT: v_readfirstlane_b32 s23, v20 +; VI-NEXT: v_readfirstlane_b32 s18, v1 +; VI-NEXT: v_readfirstlane_b32 s19, v2 +; VI-NEXT: v_readfirstlane_b32 s20, v3 +; VI-NEXT: v_readfirstlane_b32 s21, v4 ; VI-NEXT: v_readfirstlane_b32 s16, v5 ; VI-NEXT: v_readfirstlane_b32 s17, v6 ; VI-NEXT: v_readfirstlane_b32 s14, v7 @@ -118731,11 +118246,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readfirstlane_b32 s11, v12 ; VI-NEXT: v_readfirstlane_b32 s8, v13 ; VI-NEXT: v_readfirstlane_b32 s9, v14 -; VI-NEXT: v_readfirstlane_b32 s4, v15 -; VI-NEXT: v_readfirstlane_b32 s5, v16 -; VI-NEXT: v_readfirstlane_b32 s6, v17 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 +; VI-NEXT: v_readfirstlane_b32 s4, v17 ; VI-NEXT: s_and_b64 s[26:27], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s7, v18 +; VI-NEXT: v_readfirstlane_b32 s5, v18 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -118753,226 +118268,224 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; VI-NEXT: s_cbranch_scc0 .LBB73_3 ; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_lshr_b32 s26, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 19 +; VI-NEXT: s_lshr_b32 s26, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 20 +; VI-NEXT: s_lshr_b32 s26, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 21 +; VI-NEXT: s_lshr_b32 s26, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 17 +; VI-NEXT: s_lshr_b32 s26, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 18 ; VI-NEXT: s_lshr_b32 s26, s7, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 13 +; VI-NEXT: v_writelane_b32 v62, s26, 22 ; VI-NEXT: s_lshr_b32 s26, s7, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 14 +; VI-NEXT: v_writelane_b32 v62, s26, 23 ; VI-NEXT: s_lshr_b32 s26, s7, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 15 +; VI-NEXT: v_writelane_b32 v62, s26, 24 ; VI-NEXT: s_lshr_b32 s26, s6, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 11 +; VI-NEXT: v_writelane_b32 v62, s26, 15 ; VI-NEXT: s_lshr_b32 s26, s6, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 12 -; VI-NEXT: s_lshr_b32 s26, s5, 24 ; VI-NEXT: v_writelane_b32 v62, s26, 16 -; VI-NEXT: s_lshr_b32 s26, s5, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 17 -; VI-NEXT: s_lshr_b32 s26, s5, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 18 -; VI-NEXT: s_lshr_b32 s26, s4, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 9 -; VI-NEXT: s_lshr_b32 s26, s4, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 10 ; VI-NEXT: s_lshr_b32 s26, s9, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 19 +; VI-NEXT: v_writelane_b32 v62, s26, 25 ; VI-NEXT: s_lshr_b32 s26, s9, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 20 +; VI-NEXT: v_writelane_b32 v62, s26, 26 ; VI-NEXT: s_lshr_b32 s26, s9, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 21 +; VI-NEXT: v_writelane_b32 v62, s26, 27 ; VI-NEXT: s_lshr_b32 s26, s8, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 7 +; VI-NEXT: v_writelane_b32 v62, s26, 13 ; VI-NEXT: s_lshr_b32 s26, s8, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 14 ; VI-NEXT: s_lshr_b32 s26, s11, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 22 +; VI-NEXT: v_writelane_b32 v62, s26, 28 ; VI-NEXT: s_lshr_b32 s26, s11, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 23 +; VI-NEXT: v_writelane_b32 v62, s26, 29 ; VI-NEXT: s_lshr_b32 s26, s11, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 30 ; VI-NEXT: s_lshr_b32 s26, s10, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 5 +; VI-NEXT: v_writelane_b32 v62, s26, 11 ; VI-NEXT: s_lshr_b32 s26, s10, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 6 +; VI-NEXT: v_writelane_b32 v62, s26, 12 ; VI-NEXT: s_lshr_b32 s26, s13, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 25 +; VI-NEXT: v_writelane_b32 v62, s26, 31 ; VI-NEXT: s_lshr_b32 s26, s13, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 26 +; VI-NEXT: v_writelane_b32 v62, s26, 32 ; VI-NEXT: s_lshr_b32 s26, s13, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 27 +; VI-NEXT: v_writelane_b32 v62, s26, 33 ; VI-NEXT: s_lshr_b32 s26, s12, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 3 +; VI-NEXT: v_writelane_b32 v62, s26, 9 ; VI-NEXT: s_lshr_b32 s26, s12, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 4 +; VI-NEXT: v_writelane_b32 v62, s26, 10 ; VI-NEXT: s_lshr_b32 s26, s15, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 28 +; VI-NEXT: v_writelane_b32 v62, s26, 34 ; VI-NEXT: s_lshr_b32 s26, s15, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 29 +; VI-NEXT: v_writelane_b32 v62, s26, 35 ; VI-NEXT: s_lshr_b32 s26, s15, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 30 +; VI-NEXT: v_writelane_b32 v62, s26, 36 ; VI-NEXT: s_lshr_b32 s26, s14, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 1 +; VI-NEXT: v_writelane_b32 v62, s26, 7 ; VI-NEXT: s_lshr_b32 s26, s14, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 2 +; VI-NEXT: v_writelane_b32 v62, s26, 8 ; VI-NEXT: s_lshr_b32 s26, s17, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 31 -; VI-NEXT: s_lshr_b32 s26, s17, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 32 -; VI-NEXT: s_lshr_b32 s26, s17, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 33 -; VI-NEXT: s_lshr_b32 s26, s16, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 0 -; VI-NEXT: s_lshr_b32 s26, s19, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 34 -; VI-NEXT: s_lshr_b32 s26, s19, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 35 -; VI-NEXT: s_lshr_b32 s26, s19, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 36 -; VI-NEXT: s_lshr_b32 s26, s23, 24 ; VI-NEXT: v_writelane_b32 v62, s26, 37 -; VI-NEXT: s_lshr_b32 s26, s23, 16 +; VI-NEXT: s_lshr_b32 s26, s17, 16 ; VI-NEXT: v_writelane_b32 v62, s26, 38 -; VI-NEXT: s_lshr_b32 s26, s23, 8 +; VI-NEXT: s_lshr_b32 s26, s17, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 39 +; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 5 +; VI-NEXT: s_lshr_b32 s26, s16, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 6 ; VI-NEXT: s_lshr_b32 s26, s21, 24 ; VI-NEXT: v_writelane_b32 v62, s26, 40 ; VI-NEXT: s_lshr_b32 s26, s21, 16 ; VI-NEXT: v_writelane_b32 v62, s26, 41 ; VI-NEXT: s_lshr_b32 s26, s21, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 42 -; VI-NEXT: s_lshr_b32 s26, s25, 24 +; VI-NEXT: s_lshr_b32 s26, s20, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 3 +; VI-NEXT: s_lshr_b32 s26, s20, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 4 +; VI-NEXT: s_lshr_b32 s26, s19, 24 ; VI-NEXT: v_writelane_b32 v62, s26, 43 -; VI-NEXT: s_lshr_b32 s26, s25, 16 +; VI-NEXT: s_lshr_b32 s26, s19, 16 ; VI-NEXT: v_writelane_b32 v62, s26, 44 -; VI-NEXT: s_lshr_b32 s26, s25, 8 +; VI-NEXT: s_lshr_b32 s26, s19, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 45 -; VI-NEXT: s_lshr_b32 s26, s41, 24 +; VI-NEXT: s_lshr_b32 s26, s18, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 1 +; VI-NEXT: s_lshr_b32 s26, s18, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 2 +; VI-NEXT: s_lshr_b32 s26, s23, 24 ; VI-NEXT: v_writelane_b32 v62, s26, 46 -; VI-NEXT: s_lshr_b32 s26, s41, 16 +; VI-NEXT: s_lshr_b32 s26, s23, 16 ; VI-NEXT: v_writelane_b32 v62, s26, 47 -; VI-NEXT: s_lshr_b32 s26, s41, 8 +; VI-NEXT: s_lshr_b32 s26, s23, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 48 -; VI-NEXT: s_lshr_b32 s26, s43, 24 +; VI-NEXT: s_lshr_b32 s26, s22, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 0 +; VI-NEXT: s_lshr_b32 s26, s25, 24 ; VI-NEXT: v_writelane_b32 v62, s26, 49 -; VI-NEXT: s_lshr_b32 s26, s43, 16 +; VI-NEXT: s_lshr_b32 s26, s25, 16 ; VI-NEXT: v_writelane_b32 v62, s26, 50 -; VI-NEXT: s_lshr_b32 s26, s43, 8 +; VI-NEXT: s_lshr_b32 s26, s25, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 51 -; VI-NEXT: s_lshr_b32 s26, s45, 24 +; VI-NEXT: s_lshr_b32 s26, s41, 24 ; VI-NEXT: v_writelane_b32 v62, s26, 52 -; VI-NEXT: s_lshr_b32 s26, s45, 16 +; VI-NEXT: s_lshr_b32 s26, s41, 16 ; VI-NEXT: v_writelane_b32 v62, s26, 53 -; VI-NEXT: s_lshr_b32 s26, s45, 8 +; VI-NEXT: s_lshr_b32 s26, s41, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 54 -; VI-NEXT: s_lshr_b32 s26, s47, 24 +; VI-NEXT: s_lshr_b32 s26, s43, 24 ; VI-NEXT: v_writelane_b32 v62, s26, 55 -; VI-NEXT: s_lshr_b32 s26, s47, 16 +; VI-NEXT: s_lshr_b32 s26, s43, 16 ; VI-NEXT: v_writelane_b32 v62, s26, 56 -; VI-NEXT: s_lshr_b32 s26, s47, 8 -; VI-NEXT: s_lshr_b32 s82, s16, 8 -; VI-NEXT: s_lshr_b32 s83, s18, 16 -; VI-NEXT: s_lshr_b32 s55, s18, 8 -; VI-NEXT: s_lshr_b32 s65, s22, 16 -; VI-NEXT: s_lshr_b32 s64, s22, 8 -; VI-NEXT: s_lshr_b32 s85, s20, 16 -; VI-NEXT: s_lshr_b32 s84, s20, 8 -; VI-NEXT: s_lshr_b32 s67, s24, 16 -; VI-NEXT: s_lshr_b32 s66, s24, 8 -; VI-NEXT: s_lshr_b32 s69, s40, 16 -; VI-NEXT: s_lshr_b32 s68, s40, 8 -; VI-NEXT: s_lshr_b32 s71, s42, 16 -; VI-NEXT: s_lshr_b32 s70, s42, 8 -; VI-NEXT: s_lshr_b32 s87, s44, 16 -; VI-NEXT: s_lshr_b32 s86, s44, 8 +; VI-NEXT: s_lshr_b32 s26, s43, 8 +; VI-NEXT: s_lshr_b32 s68, s22, 16 +; VI-NEXT: s_lshr_b32 s66, s24, 16 +; VI-NEXT: s_lshr_b32 s65, s24, 8 +; VI-NEXT: s_lshr_b32 s67, s40, 16 +; VI-NEXT: s_lshr_b32 s80, s40, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 57 -; VI-NEXT: s_lshr_b32 s81, s46, 16 -; VI-NEXT: s_lshr_b32 s80, s46, 8 +; VI-NEXT: s_lshr_b32 s81, s42, 16 +; VI-NEXT: s_lshr_b32 s69, s42, 8 +; VI-NEXT: s_lshr_b32 s83, s45, 24 +; VI-NEXT: s_lshr_b32 s84, s45, 16 +; VI-NEXT: s_lshr_b32 s85, s45, 8 +; VI-NEXT: s_lshr_b32 s54, s44, 16 +; VI-NEXT: s_lshr_b32 s82, s44, 8 +; VI-NEXT: s_lshr_b32 s86, s47, 24 +; VI-NEXT: s_lshr_b32 s87, s47, 16 +; VI-NEXT: s_lshr_b32 s50, s47, 8 +; VI-NEXT: s_lshr_b32 s55, s46, 16 +; VI-NEXT: s_lshr_b32 s70, s46, 8 ; VI-NEXT: s_lshr_b32 s51, s57, 24 ; VI-NEXT: s_lshr_b32 s52, s57, 16 ; VI-NEXT: s_lshr_b32 s53, s57, 8 -; VI-NEXT: s_lshr_b32 s54, s56, 16 -; VI-NEXT: s_lshr_b32 s50, s56, 8 -; VI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 -; VI-NEXT: s_lshr_b64 s[28:29], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b32 s64, s56, 16 +; VI-NEXT: s_lshr_b32 s71, s56, 8 +; VI-NEXT: s_lshr_b64 s[48:49], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[6:7], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 ; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[60:61], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[28:29], s[46:47], 24 +; VI-NEXT: s_lshr_b64 s[26:27], s[56:57], 24 ; VI-NEXT: s_cbranch_execnz .LBB73_4 ; VI-NEXT: .LBB73_2: ; %cmp.true -; VI-NEXT: v_add_f64 v[1:2], s[6:7], 1.0 -; VI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 -; VI-NEXT: v_add_f64 v[23:24], s[18:19], 1.0 -; VI-NEXT: v_add_f64 v[25:26], s[20:21], 1.0 -; VI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; VI-NEXT: v_add_f64 v[11:12], s[14:15], 1.0 +; VI-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; VI-NEXT: v_add_f64 v[19:20], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[23:24], s[22:23], 1.0 ; VI-NEXT: v_add_f64 v[27:28], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 ; VI-NEXT: v_add_f64 v[31:32], s[40:41], 1.0 -; VI-NEXT: v_add_f64 v[33:34], s[42:43], 1.0 -; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[1:2] -; VI-NEXT: v_add_f64 v[19:20], s[22:23], 1.0 -; VI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 -; VI-NEXT: v_add_f64 v[37:38], s[44:45], 1.0 -; VI-NEXT: v_add_f64 v[17:18], s[16:17], 1.0 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_add_f64 v[52:53], s[56:57], 1.0 +; VI-NEXT: v_add_f64 v[35:36], s[42:43], 1.0 +; VI-NEXT: v_add_f64 v[48:49], s[44:45], 1.0 +; VI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 ; VI-NEXT: v_add_f64 v[50:51], s[46:47], 1.0 -; VI-NEXT: v_add_f64 v[13:14], s[14:15], 1.0 -; VI-NEXT: v_add_f64 v[11:12], s[12:13], 1.0 -; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[3:4] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[11:12] +; VI-NEXT: v_add_f64 v[52:53], s[56:57], 1.0 +; VI-NEXT: v_add_f64 v[17:18], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 +; VI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[38:39], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] -; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[27:28] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[31:32] -; VI-NEXT: v_readfirstlane_b32 s11, v8 -; VI-NEXT: v_lshrrev_b64 v[29:30], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[19:20] -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[33:34] -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[37:38] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] +; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[31:32] +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[35:36] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[48:49] +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[50:51] ; VI-NEXT: v_readfirstlane_b32 s57, v53 ; VI-NEXT: v_readfirstlane_b32 s47, v51 -; VI-NEXT: v_readfirstlane_b32 s45, v38 -; VI-NEXT: v_readfirstlane_b32 s43, v34 +; VI-NEXT: v_readfirstlane_b32 s45, v49 +; VI-NEXT: v_readfirstlane_b32 s43, v36 ; VI-NEXT: v_readfirstlane_b32 s41, v32 ; VI-NEXT: v_readfirstlane_b32 s25, v28 -; VI-NEXT: v_readfirstlane_b32 s21, v26 -; VI-NEXT: v_readfirstlane_b32 s23, v20 -; VI-NEXT: v_readfirstlane_b32 s19, v24 -; VI-NEXT: v_readfirstlane_b32 s17, v18 -; VI-NEXT: v_readfirstlane_b32 s15, v14 -; VI-NEXT: v_readfirstlane_b32 s13, v12 +; VI-NEXT: v_readfirstlane_b32 s23, v24 +; VI-NEXT: v_readfirstlane_b32 s19, v20 +; VI-NEXT: v_readfirstlane_b32 s21, v18 +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s15, v12 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s11, v8 ; VI-NEXT: v_readfirstlane_b32 s9, v6 -; VI-NEXT: v_readfirstlane_b32 s5, v4 -; VI-NEXT: v_readfirstlane_b32 s7, v2 -; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[50:51] -; VI-NEXT: v_lshrrev_b64 v[55:56], 24, v[52:53] -; VI-NEXT: s_lshr_b32 s10, s7, 24 -; VI-NEXT: s_lshr_b32 s12, s7, 16 -; VI-NEXT: s_lshr_b32 s14, s7, 8 +; VI-NEXT: v_readfirstlane_b32 s7, v4 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[29:30], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[52:53] +; VI-NEXT: s_lshr_b32 s10, s5, 24 +; VI-NEXT: s_lshr_b32 s12, s5, 16 +; VI-NEXT: s_lshr_b32 s14, s5, 8 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; VI-NEXT: s_lshr_b32 s16, s5, 24 -; VI-NEXT: s_lshr_b32 s18, s5, 16 -; VI-NEXT: s_lshr_b32 s20, s5, 8 +; VI-NEXT: s_lshr_b32 s16, s7, 24 +; VI-NEXT: s_lshr_b32 s18, s7, 16 +; VI-NEXT: s_lshr_b32 s20, s7, 8 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; VI-NEXT: s_lshr_b32 s22, s9, 24 @@ -118988,38 +118501,38 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: s_lshr_b32 s40, s13, 24 ; VI-NEXT: s_lshr_b32 s42, s13, 16 ; VI-NEXT: s_lshr_b32 s44, s13, 8 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v9 ; VI-NEXT: s_lshr_b32 s46, s15, 24 ; VI-NEXT: s_lshr_b32 s56, s15, 16 ; VI-NEXT: s_lshr_b32 s58, s15, 8 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v11 ; VI-NEXT: s_lshr_b32 s59, s17, 24 ; VI-NEXT: s_lshr_b32 s60, s17, 16 ; VI-NEXT: s_lshr_b32 s61, s17, 8 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v17 -; VI-NEXT: s_lshr_b32 s62, s19, 24 -; VI-NEXT: s_lshr_b32 s63, s19, 16 -; VI-NEXT: s_lshr_b32 s72, s19, 8 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v23 -; VI-NEXT: s_lshr_b32 s73, s23, 24 -; VI-NEXT: s_lshr_b32 s74, s23, 16 -; VI-NEXT: s_lshr_b32 s75, s23, 8 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v13 +; VI-NEXT: s_lshr_b32 s62, s21, 24 +; VI-NEXT: s_lshr_b32 s63, s21, 16 +; VI-NEXT: s_lshr_b32 s72, s21, 8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v17 +; VI-NEXT: s_lshr_b32 s73, s19, 24 +; VI-NEXT: s_lshr_b32 s74, s19, 16 +; VI-NEXT: s_lshr_b32 s75, s19, 8 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 ; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v19 -; VI-NEXT: s_lshr_b32 s76, s21, 24 -; VI-NEXT: s_lshr_b32 s77, s21, 16 -; VI-NEXT: s_lshr_b32 s78, s21, 8 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v25 +; VI-NEXT: s_lshr_b32 s76, s23, 24 +; VI-NEXT: s_lshr_b32 s77, s23, 16 +; VI-NEXT: s_lshr_b32 s78, s23, 8 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23 ; VI-NEXT: s_lshr_b32 s79, s25, 24 ; VI-NEXT: s_lshr_b32 s88, s25, 16 ; VI-NEXT: s_lshr_b32 s89, s25, 8 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v27 ; VI-NEXT: s_lshr_b32 s90, s41, 24 ; VI-NEXT: s_lshr_b32 s91, s41, 16 ; VI-NEXT: s_lshr_b32 s30, s41, 8 @@ -119028,563 +118541,526 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: s_lshr_b32 s31, s43, 24 ; VI-NEXT: s_lshr_b32 s34, s43, 16 ; VI-NEXT: s_lshr_b32 s35, s43, 8 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v33 -; VI-NEXT: s_lshr_b32 s36, s45, 24 -; VI-NEXT: s_lshr_b32 s37, s45, 16 -; VI-NEXT: s_lshr_b32 s38, s45, 8 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v37 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v37 -; VI-NEXT: s_lshr_b32 s39, s47, 24 -; VI-NEXT: s_lshr_b32 s48, s47, 16 -; VI-NEXT: s_lshr_b32 s49, s47, 8 -; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v50 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v50 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v35 +; VI-NEXT: s_lshr_b32 s83, s45, 24 +; VI-NEXT: s_lshr_b32 s84, s45, 16 +; VI-NEXT: s_lshr_b32 s85, s45, 8 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v48 +; VI-NEXT: s_lshr_b32 s86, s47, 24 +; VI-NEXT: s_lshr_b32 s87, s47, 16 +; VI-NEXT: s_lshr_b32 s50, s47, 8 ; VI-NEXT: s_lshr_b32 s51, s57, 24 ; VI-NEXT: s_lshr_b32 s52, s57, 16 ; VI-NEXT: s_lshr_b32 s53, s57, 8 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v52 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v48 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v50 +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v50 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v52 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v52 ; VI-NEXT: s_branch .LBB73_5 ; VI-NEXT: .LBB73_3: +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr64 ; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr54 ; VI-NEXT: ; implicit-def: $sgpr53 ; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr51 -; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr81 -; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr87 ; VI-NEXT: ; implicit-def: $sgpr70 -; VI-NEXT: ; implicit-def: $sgpr71 -; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr83 ; VI-NEXT: ; implicit-def: $sgpr69 -; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr80 ; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr84 -; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr55 -; VI-NEXT: ; implicit-def: $sgpr83 -; VI-NEXT: ; implicit-def: $sgpr82 -; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr30 -; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr28 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 +; VI-NEXT: ; implicit-def: $sgpr27 +; VI-NEXT: ; kill: killed $sgpr27 ; VI-NEXT: s_branch .LBB73_2 ; VI-NEXT: .LBB73_4: -; VI-NEXT: v_mov_b32_e32 v40, s76 -; VI-NEXT: v_mov_b32_e32 v39, s58 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v39, s28 -; VI-NEXT: v_readlane_b32 s27, v62, 0 -; VI-NEXT: v_mov_b32_e32 v26, s27 -; VI-NEXT: v_readlane_b32 s27, v62, 1 -; VI-NEXT: v_mov_b32_e32 v22, s27 -; VI-NEXT: v_readlane_b32 s27, v62, 2 -; VI-NEXT: v_mov_b32_e32 v24, s27 -; VI-NEXT: v_readlane_b32 s27, v62, 3 -; VI-NEXT: v_mov_b32_e32 v18, s27 -; VI-NEXT: v_readlane_b32 s27, v62, 4 -; VI-NEXT: v_mov_b32_e32 v20, s27 -; VI-NEXT: v_readlane_b32 s27, v62, 5 -; VI-NEXT: v_mov_b32_e32 v14, s27 -; VI-NEXT: v_readlane_b32 s27, v62, 6 -; VI-NEXT: v_mov_b32_e32 v16, s27 -; VI-NEXT: v_readlane_b32 s27, v62, 7 -; VI-NEXT: v_mov_b32_e32 v10, s27 -; VI-NEXT: v_readlane_b32 s27, v62, 8 -; VI-NEXT: v_mov_b32_e32 v12, s27 -; VI-NEXT: v_readlane_b32 s27, v62, 9 -; VI-NEXT: v_mov_b32_e32 v6, s27 -; VI-NEXT: v_readlane_b32 s27, v62, 10 -; VI-NEXT: v_mov_b32_e32 v30, s83 -; VI-NEXT: v_mov_b32_e32 v8, s27 -; VI-NEXT: v_readlane_b32 s27, v62, 11 -; VI-NEXT: v_mov_b32_e32 v29, s78 -; VI-NEXT: v_mov_b32_e32 v2, s27 -; VI-NEXT: v_readlane_b32 s27, v62, 12 -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v15, s54 -; VI-NEXT: v_mov_b32_e32 v21, s50 -; VI-NEXT: v_mov_b32_e32 v61, s81 -; VI-NEXT: v_mov_b32_e32 v9, s80 -; VI-NEXT: v_mov_b32_e32 v59, s87 -; VI-NEXT: v_mov_b32_e32 v60, s86 -; VI-NEXT: v_mov_b32_e32 v57, s71 -; VI-NEXT: v_mov_b32_e32 v58, s70 -; VI-NEXT: v_mov_b32_e32 v47, s69 -; VI-NEXT: v_mov_b32_e32 v56, s68 -; VI-NEXT: v_mov_b32_e32 v51, s67 -; VI-NEXT: v_mov_b32_e32 v53, s66 -; VI-NEXT: v_mov_b32_e32 v38, s85 -; VI-NEXT: v_mov_b32_e32 v49, s84 -; VI-NEXT: v_mov_b32_e32 v34, s65 -; VI-NEXT: v_mov_b32_e32 v36, s64 -; VI-NEXT: v_mov_b32_e32 v32, s55 -; VI-NEXT: v_mov_b32_e32 v28, s82 -; VI-NEXT: v_mov_b32_e32 v4, s27 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v2, s36 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v2, s38 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 0 +; VI-NEXT: v_mov_b32_e32 v51, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 1 +; VI-NEXT: v_mov_b32_e32 v34, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 2 +; VI-NEXT: v_mov_b32_e32 v36, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 3 +; VI-NEXT: v_mov_b32_e32 v30, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 4 +; VI-NEXT: v_mov_b32_e32 v32, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 5 +; VI-NEXT: v_mov_b32_e32 v26, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 6 +; VI-NEXT: v_mov_b32_e32 v28, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 7 +; VI-NEXT: v_mov_b32_e32 v22, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 8 +; VI-NEXT: v_mov_b32_e32 v24, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 9 +; VI-NEXT: v_mov_b32_e32 v18, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 10 +; VI-NEXT: v_mov_b32_e32 v20, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 11 +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 12 +; VI-NEXT: v_mov_b32_e32 v16, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 13 +; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 14 +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 15 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 16 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 17 ; VI-NEXT: v_mov_b32_e32 v52, s56 ; VI-NEXT: v_mov_b32_e32 v50, s46 -; VI-NEXT: v_mov_b32_e32 v37, s44 -; VI-NEXT: v_mov_b32_e32 v33, s42 +; VI-NEXT: v_mov_b32_e32 v48, s44 +; VI-NEXT: v_mov_b32_e32 v35, s42 ; VI-NEXT: v_mov_b32_e32 v31, s40 ; VI-NEXT: v_mov_b32_e32 v27, s24 -; VI-NEXT: v_mov_b32_e32 v25, s20 -; VI-NEXT: v_mov_b32_e32 v19, s22 -; VI-NEXT: v_mov_b32_e32 v23, s18 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v13, s14 -; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v23, s22 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: v_mov_b32_e32 v11, s14 +; VI-NEXT: v_mov_b32_e32 v9, s12 ; VI-NEXT: v_mov_b32_e32 v7, s10 ; VI-NEXT: v_mov_b32_e32 v5, s8 -; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v55, s48 -; VI-NEXT: v_mov_b32_e32 v46, s38 -; VI-NEXT: v_mov_b32_e32 v45, s36 -; VI-NEXT: v_mov_b32_e32 v44, s34 -; VI-NEXT: v_mov_b32_e32 v43, s30 -; VI-NEXT: v_mov_b32_e32 v42, s90 -; VI-NEXT: v_mov_b32_e32 v41, s88 -; VI-NEXT: v_mov_b32_e32 v54, s74 -; VI-NEXT: v_mov_b32_e32 v48, s72 -; VI-NEXT: v_mov_b32_e32 v35, s62 -; VI-NEXT: v_mov_b32_e32 v29, s60 -; VI-NEXT: v_readlane_b32 s10, v62, 13 -; VI-NEXT: v_readlane_b32 s12, v62, 14 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v39, s26 -; VI-NEXT: v_readlane_b32 s14, v62, 15 -; VI-NEXT: v_readlane_b32 s16, v62, 16 -; VI-NEXT: v_readlane_b32 s18, v62, 17 -; VI-NEXT: v_readlane_b32 s20, v62, 18 -; VI-NEXT: v_readlane_b32 s22, v62, 19 -; VI-NEXT: v_readlane_b32 s24, v62, 20 -; VI-NEXT: v_readlane_b32 s26, v62, 21 -; VI-NEXT: v_readlane_b32 s27, v62, 22 -; VI-NEXT: v_readlane_b32 s28, v62, 23 -; VI-NEXT: v_readlane_b32 s29, v62, 24 -; VI-NEXT: v_readlane_b32 s40, v62, 25 -; VI-NEXT: v_readlane_b32 s42, v62, 26 -; VI-NEXT: v_readlane_b32 s44, v62, 27 -; VI-NEXT: v_readlane_b32 s46, v62, 28 -; VI-NEXT: v_readlane_b32 s56, v62, 29 -; VI-NEXT: v_readlane_b32 s58, v62, 30 -; VI-NEXT: v_readlane_b32 s59, v62, 31 -; VI-NEXT: v_readlane_b32 s60, v62, 32 -; VI-NEXT: v_readlane_b32 s61, v62, 33 -; VI-NEXT: v_readlane_b32 s62, v62, 34 -; VI-NEXT: v_readlane_b32 s63, v62, 35 -; VI-NEXT: v_readlane_b32 s72, v62, 36 -; VI-NEXT: v_readlane_b32 s73, v62, 37 -; VI-NEXT: v_readlane_b32 s74, v62, 38 -; VI-NEXT: v_readlane_b32 s75, v62, 39 -; VI-NEXT: v_readlane_b32 s76, v62, 40 -; VI-NEXT: v_readlane_b32 s77, v62, 41 -; VI-NEXT: v_readlane_b32 s78, v62, 42 -; VI-NEXT: v_readlane_b32 s79, v62, 43 -; VI-NEXT: v_readlane_b32 s88, v62, 44 -; VI-NEXT: v_readlane_b32 s89, v62, 45 -; VI-NEXT: v_readlane_b32 s90, v62, 46 -; VI-NEXT: v_readlane_b32 s91, v62, 47 -; VI-NEXT: v_readlane_b32 s30, v62, 48 -; VI-NEXT: v_readlane_b32 s31, v62, 49 -; VI-NEXT: v_readlane_b32 s34, v62, 50 -; VI-NEXT: v_readlane_b32 s35, v62, 51 -; VI-NEXT: v_readlane_b32 s36, v62, 52 -; VI-NEXT: v_readlane_b32 s37, v62, 53 -; VI-NEXT: v_readlane_b32 s38, v62, 54 -; VI-NEXT: v_readlane_b32 s39, v62, 55 -; VI-NEXT: v_readlane_b32 s48, v62, 56 -; VI-NEXT: v_readlane_b32 s49, v62, 57 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v46, s26 +; VI-NEXT: v_mov_b32_e32 v21, s64 +; VI-NEXT: v_mov_b32_e32 v25, s71 +; VI-NEXT: v_mov_b32_e32 v45, s28 +; VI-NEXT: v_mov_b32_e32 v61, s55 +; VI-NEXT: v_mov_b32_e32 v15, s70 +; VI-NEXT: v_mov_b32_e32 v44, s58 +; VI-NEXT: v_mov_b32_e32 v59, s54 +; VI-NEXT: v_mov_b32_e32 v60, s82 +; VI-NEXT: v_mov_b32_e32 v43, s60 +; VI-NEXT: v_mov_b32_e32 v57, s81 +; VI-NEXT: v_mov_b32_e32 v58, s69 +; VI-NEXT: v_mov_b32_e32 v42, s62 +; VI-NEXT: v_mov_b32_e32 v47, s67 +; VI-NEXT: v_mov_b32_e32 v56, s80 +; VI-NEXT: v_mov_b32_e32 v41, s72 +; VI-NEXT: v_mov_b32_e32 v53, s66 +; VI-NEXT: v_mov_b32_e32 v55, s65 +; VI-NEXT: v_mov_b32_e32 v40, s74 +; VI-NEXT: v_mov_b32_e32 v49, s68 +; VI-NEXT: v_mov_b32_e32 v39, s76 +; VI-NEXT: v_mov_b32_e32 v54, s78 +; VI-NEXT: v_mov_b32_e32 v38, s88 +; VI-NEXT: v_mov_b32_e32 v37, s90 +; VI-NEXT: v_mov_b32_e32 v33, s30 +; VI-NEXT: v_mov_b32_e32 v29, s34 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v2, s48 +; VI-NEXT: v_readlane_b32 s10, v62, 19 +; VI-NEXT: v_readlane_b32 s12, v62, 20 +; VI-NEXT: v_readlane_b32 s14, v62, 21 +; VI-NEXT: v_readlane_b32 s16, v62, 22 +; VI-NEXT: v_readlane_b32 s18, v62, 23 +; VI-NEXT: v_readlane_b32 s20, v62, 24 +; VI-NEXT: v_readlane_b32 s22, v62, 25 +; VI-NEXT: v_readlane_b32 s24, v62, 26 +; VI-NEXT: v_readlane_b32 s26, v62, 27 +; VI-NEXT: v_readlane_b32 s27, v62, 28 +; VI-NEXT: v_readlane_b32 s28, v62, 29 +; VI-NEXT: v_readlane_b32 s29, v62, 30 +; VI-NEXT: v_readlane_b32 s40, v62, 31 +; VI-NEXT: v_readlane_b32 s42, v62, 32 +; VI-NEXT: v_readlane_b32 s44, v62, 33 +; VI-NEXT: v_readlane_b32 s46, v62, 34 +; VI-NEXT: v_readlane_b32 s56, v62, 35 +; VI-NEXT: v_readlane_b32 s58, v62, 36 +; VI-NEXT: v_readlane_b32 s59, v62, 37 +; VI-NEXT: v_readlane_b32 s60, v62, 38 +; VI-NEXT: v_readlane_b32 s61, v62, 39 +; VI-NEXT: v_readlane_b32 s62, v62, 40 +; VI-NEXT: v_readlane_b32 s63, v62, 41 +; VI-NEXT: v_readlane_b32 s72, v62, 42 +; VI-NEXT: v_readlane_b32 s73, v62, 43 +; VI-NEXT: v_readlane_b32 s74, v62, 44 +; VI-NEXT: v_readlane_b32 s75, v62, 45 +; VI-NEXT: v_readlane_b32 s76, v62, 46 +; VI-NEXT: v_readlane_b32 s77, v62, 47 +; VI-NEXT: v_readlane_b32 s78, v62, 48 +; VI-NEXT: v_readlane_b32 s79, v62, 49 +; VI-NEXT: v_readlane_b32 s88, v62, 50 +; VI-NEXT: v_readlane_b32 s89, v62, 51 +; VI-NEXT: v_readlane_b32 s90, v62, 52 +; VI-NEXT: v_readlane_b32 s91, v62, 53 +; VI-NEXT: v_readlane_b32 s30, v62, 54 +; VI-NEXT: v_readlane_b32 s31, v62, 55 +; VI-NEXT: v_readlane_b32 s34, v62, 56 +; VI-NEXT: v_readlane_b32 s35, v62, 57 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 18 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: .LBB73_5: ; %end ; VI-NEXT: s_and_b32 s4, s57, 0xff ; VI-NEXT: s_lshl_b32 s6, s53, 8 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s52, 0xff ; VI-NEXT: s_lshl_b32 s8, s51, 8 -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v21, v52, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v55 +; VI-NEXT: v_or_b32_sdwa v25, v52, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v46 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v15, v15, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v21, s4 +; VI-NEXT: v_or_b32_sdwa v21, v25, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v25, s4 ; VI-NEXT: s_and_b32 s4, s47, 0xff -; VI-NEXT: s_lshl_b32 s6, s49, 8 -; VI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v15, vcc, 4, v0 +; VI-NEXT: s_lshl_b32 s6, s50, 8 +; VI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v21, vcc, 4, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: s_and_b32 s6, s48, 0xff -; VI-NEXT: s_lshl_b32 s8, s39, 8 -; VI-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v46 +; VI-NEXT: s_and_b32 s6, s87, 0xff +; VI-NEXT: s_lshl_b32 s8, s86, 8 +; VI-NEXT: buffer_store_dword v25, v21, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v45 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v9, v50, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v61, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v50, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v61, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v15, vcc, 8, v0 +; VI-NEXT: v_or_b32_sdwa v15, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v21, vcc, 8, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: buffer_store_dword v15, v21, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v21, s4 ; VI-NEXT: s_and_b32 s4, s45, 0xff -; VI-NEXT: s_lshl_b32 s6, s38, 8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 12, v0 +; VI-NEXT: s_lshl_b32 s6, s85, 8 +; VI-NEXT: v_add_u32_e32 v15, vcc, 12, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: s_and_b32 s6, s37, 0xff -; VI-NEXT: s_lshl_b32 s8, s36, 8 -; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v60 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 +; VI-NEXT: s_and_b32 s6, s84, 0xff +; VI-NEXT: s_lshl_b32 s8, s83, 8 +; VI-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v60 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v44 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v9, v37, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v59, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v59, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v15, vcc, 16, v0 +; VI-NEXT: v_or_b32_sdwa v15, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v21, vcc, 16, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: buffer_store_dword v15, v21, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v21, s4 ; VI-NEXT: s_and_b32 s4, s43, 0xff ; VI-NEXT: s_lshl_b32 s6, s35, 8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 20, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 20, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s34, 0xff ; VI-NEXT: s_lshl_b32 s8, s31, 8 -; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v58 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v44 +; VI-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v43 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v9, v33, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v57, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v57, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v15, vcc, 24, v0 +; VI-NEXT: v_or_b32_sdwa v15, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v21, vcc, 24, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: buffer_store_dword v15, v21, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v21, s4 ; VI-NEXT: s_and_b32 s4, s41, 0xff ; VI-NEXT: s_lshl_b32 s6, s30, 8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 28, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 28, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s91, 0xff ; VI-NEXT: s_lshl_b32 s8, s90, 8 -; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v56 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 +; VI-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v56 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v42 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v9, v31, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v47, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v15, vcc, 32, v0 +; VI-NEXT: v_or_b32_sdwa v15, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v21, vcc, 32, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: buffer_store_dword v15, v21, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v21, s4 ; VI-NEXT: s_and_b32 s4, s25, 0xff ; VI-NEXT: s_lshl_b32 s6, s89, 8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 36, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 36, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s88, 0xff ; VI-NEXT: s_lshl_b32 s8, s79, 8 -; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v53 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v42 +; VI-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v55 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v41 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v9, v27, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v51, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v15, vcc, 40, v0 +; VI-NEXT: v_or_b32_sdwa v15, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v21, vcc, 40, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v15, s4 -; VI-NEXT: s_and_b32 s4, s21, 0xff +; VI-NEXT: buffer_store_dword v15, v21, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v21, s4 +; VI-NEXT: s_and_b32 s4, s23, 0xff ; VI-NEXT: s_lshl_b32 s6, s78, 8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 44, v0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 44, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s77, 0xff ; VI-NEXT: s_lshl_b32 s8, s76, 8 -; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v49 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v41 +; VI-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v51 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v40 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v15, vcc, 48, v0 +; VI-NEXT: v_or_b32_sdwa v15, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v21, vcc, 48, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v9, vcc, 52, v0 -; VI-NEXT: v_mov_b32_e32 v15, s4 -; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v36 -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: s_and_b32 s4, s23, 0xff +; VI-NEXT: buffer_store_dword v15, v21, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v15, vcc, 52, v0 +; VI-NEXT: v_mov_b32_e32 v21, s4 +; VI-NEXT: s_and_b32 s4, s19, 0xff ; VI-NEXT: s_lshl_b32 s6, s75, 8 +; VI-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v36 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s74, 0xff ; VI-NEXT: s_lshl_b32 s8, s73, 8 +; VI-NEXT: v_or_b32_sdwa v15, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v39 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v9, v19, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_or_b32_sdwa v15, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v19, vcc, 56, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v15, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v19, s4 +; VI-NEXT: s_and_b32 s4, s21, 0xff ; VI-NEXT: s_lshl_b32 s6, s72, 8 -; VI-NEXT: s_lshl_b32 s8, s62, 8 -; VI-NEXT: v_readlane_b32 s87, v63, 31 -; VI-NEXT: v_readlane_b32 s86, v63, 30 -; VI-NEXT: v_readlane_b32 s85, v63, 29 -; VI-NEXT: v_readlane_b32 s84, v63, 28 -; VI-NEXT: v_readlane_b32 s83, v63, 27 -; VI-NEXT: v_readlane_b32 s82, v63, 26 -; VI-NEXT: v_readlane_b32 s81, v63, 25 -; VI-NEXT: v_readlane_b32 s80, v63, 24 -; VI-NEXT: v_readlane_b32 s71, v63, 23 -; VI-NEXT: v_readlane_b32 s70, v63, 22 -; VI-NEXT: v_readlane_b32 s69, v63, 21 -; VI-NEXT: v_readlane_b32 s68, v63, 20 -; VI-NEXT: v_readlane_b32 s67, v63, 19 -; VI-NEXT: v_readlane_b32 s66, v63, 18 -; VI-NEXT: v_readlane_b32 s65, v63, 17 -; VI-NEXT: v_readlane_b32 s64, v63, 16 -; VI-NEXT: v_readlane_b32 s55, v63, 15 -; VI-NEXT: v_readlane_b32 s54, v63, 14 -; VI-NEXT: v_readlane_b32 s53, v63, 13 -; VI-NEXT: v_readlane_b32 s52, v63, 12 -; VI-NEXT: v_readlane_b32 s51, v63, 11 -; VI-NEXT: v_readlane_b32 s50, v63, 10 -; VI-NEXT: v_readlane_b32 s49, v63, 9 -; VI-NEXT: v_readlane_b32 s48, v63, 8 -; VI-NEXT: v_readlane_b32 s39, v63, 7 -; VI-NEXT: v_readlane_b32 s38, v63, 6 -; VI-NEXT: v_readlane_b32 s37, v63, 5 -; VI-NEXT: v_readlane_b32 s36, v63, 4 -; VI-NEXT: v_readlane_b32 s35, v63, 3 -; VI-NEXT: v_readlane_b32 s34, v63, 2 -; VI-NEXT: v_readlane_b32 s31, v63, 1 -; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v36 -; VI-NEXT: v_or_b32_sdwa v15, v34, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v15, vcc, 56, v0 -; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v15, s4 -; VI-NEXT: s_and_b32 s4, s19, 0xff -; VI-NEXT: v_add_u32_e32 v9, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v19, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v32 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s63, 0xff -; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v32 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v40 +; VI-NEXT: s_lshl_b32 s8, s62, 8 +; VI-NEXT: v_or_b32_sdwa v15, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v54 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v9, v23, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v30, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v30, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v15, vcc, 64, v0 +; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x44, v0 +; VI-NEXT: v_mov_b32_e32 v17, s4 ; VI-NEXT: s_and_b32 s4, s17, 0xff ; VI-NEXT: s_lshl_b32 s6, s61, 8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v17, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v28 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s60, 0xff ; VI-NEXT: s_lshl_b32 s8, s59, 8 -; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v28 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v54 +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v38 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v9, v17, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x48, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v9, v15, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x4c, v0 ; VI-NEXT: v_mov_b32_e32 v15, s4 ; VI-NEXT: s_and_b32 s4, s15, 0xff ; VI-NEXT: s_lshl_b32 s6, s58, 8 -; VI-NEXT: buffer_store_dword v15, v9, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v24 +; VI-NEXT: buffer_store_dword v15, v13, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v24 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s56, 0xff ; VI-NEXT: s_lshl_b32 s8, s46, 8 -; VI-NEXT: v_or_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v48 +; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v37 ; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_or_b32_sdwa v13, v22, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v9, v9, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x50, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v9, v13, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x54, v0 ; VI-NEXT: v_mov_b32_e32 v13, s4 ; VI-NEXT: s_and_b32 s4, s13, 0xff ; VI-NEXT: s_lshl_b32 s6, s44, 8 -; VI-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v20 +; VI-NEXT: buffer_store_dword v13, v11, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v20 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s42, 0xff ; VI-NEXT: s_lshl_b32 s8, s40, 8 -; VI-NEXT: v_or_b32_sdwa v9, v11, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v35 +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 ; VI-NEXT: s_or_b32 s6, s6, s8 ; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff @@ -119628,7 +119104,39 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: s_lshl_b32 s6, s16, 8 +; VI-NEXT: s_lshl_b32 s6, s20, 8 +; VI-NEXT: v_readlane_b32 s87, v63, 31 +; VI-NEXT: v_readlane_b32 s86, v63, 30 +; VI-NEXT: v_readlane_b32 s85, v63, 29 +; VI-NEXT: v_readlane_b32 s84, v63, 28 +; VI-NEXT: v_readlane_b32 s83, v63, 27 +; VI-NEXT: v_readlane_b32 s82, v63, 26 +; VI-NEXT: v_readlane_b32 s81, v63, 25 +; VI-NEXT: v_readlane_b32 s80, v63, 24 +; VI-NEXT: v_readlane_b32 s71, v63, 23 +; VI-NEXT: v_readlane_b32 s70, v63, 22 +; VI-NEXT: v_readlane_b32 s69, v63, 21 +; VI-NEXT: v_readlane_b32 s68, v63, 20 +; VI-NEXT: v_readlane_b32 s67, v63, 19 +; VI-NEXT: v_readlane_b32 s66, v63, 18 +; VI-NEXT: v_readlane_b32 s65, v63, 17 +; VI-NEXT: v_readlane_b32 s64, v63, 16 +; VI-NEXT: v_readlane_b32 s55, v63, 15 +; VI-NEXT: v_readlane_b32 s54, v63, 14 +; VI-NEXT: v_readlane_b32 s53, v63, 13 +; VI-NEXT: v_readlane_b32 s52, v63, 12 +; VI-NEXT: v_readlane_b32 s51, v63, 11 +; VI-NEXT: v_readlane_b32 s50, v63, 10 +; VI-NEXT: v_readlane_b32 s49, v63, 9 +; VI-NEXT: v_readlane_b32 s48, v63, 8 +; VI-NEXT: v_readlane_b32 s39, v63, 7 +; VI-NEXT: v_readlane_b32 s38, v63, 6 +; VI-NEXT: v_readlane_b32 s37, v63, 5 +; VI-NEXT: v_readlane_b32 s36, v63, 4 +; VI-NEXT: v_readlane_b32 s35, v63, 3 +; VI-NEXT: v_readlane_b32 s34, v63, 2 +; VI-NEXT: v_readlane_b32 s31, v63, 1 +; VI-NEXT: v_readlane_b32 s30, v63, 0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v11 ; VI-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -119641,16 +119149,15 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: s_and_b32 s4, s5, 0xff -; VI-NEXT: s_lshl_b32 s5, s20, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_and_b32 s4, s7, 0xff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s18, 0xff +; VI-NEXT: s_lshl_b32 s7, s16, 8 ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_lshl_b32 s5, s14, 8 +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_lshl_b32 s6, s10, 8 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 @@ -119665,7 +119172,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: s_and_b32 s4, s7, 0xff +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: s_lshl_b32 s5, s14, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s12, 0xff ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -119696,8 +119204,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -119706,8 +119214,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v63, s30, 0 ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 @@ -119774,10 +119282,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; GFX9-NEXT: v_writelane_b32 v63, s99, 35 ; GFX9-NEXT: v_readfirstlane_b32 s23, v20 -; GFX9-NEXT: v_readfirstlane_b32 s20, v1 -; GFX9-NEXT: v_readfirstlane_b32 s21, v2 -; GFX9-NEXT: v_readfirstlane_b32 s18, v3 -; GFX9-NEXT: v_readfirstlane_b32 s19, v4 +; GFX9-NEXT: v_readfirstlane_b32 s18, v1 +; GFX9-NEXT: v_readfirstlane_b32 s19, v2 +; GFX9-NEXT: v_readfirstlane_b32 s20, v3 +; GFX9-NEXT: v_readfirstlane_b32 s21, v4 ; GFX9-NEXT: v_readfirstlane_b32 s16, v5 ; GFX9-NEXT: v_readfirstlane_b32 s17, v6 ; GFX9-NEXT: v_readfirstlane_b32 s14, v7 @@ -119788,11 +119296,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readfirstlane_b32 s11, v12 ; GFX9-NEXT: v_readfirstlane_b32 s8, v13 ; GFX9-NEXT: v_readfirstlane_b32 s9, v14 -; GFX9-NEXT: v_readfirstlane_b32 s4, v15 -; GFX9-NEXT: v_readfirstlane_b32 s5, v16 -; GFX9-NEXT: v_readfirstlane_b32 s6, v17 +; GFX9-NEXT: v_readfirstlane_b32 s6, v15 +; GFX9-NEXT: v_readfirstlane_b32 s7, v16 +; GFX9-NEXT: v_readfirstlane_b32 s4, v17 ; GFX9-NEXT: s_and_b64 s[26:27], vcc, exec -; GFX9-NEXT: v_readfirstlane_b32 s7, v18 +; GFX9-NEXT: v_readfirstlane_b32 s5, v18 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -119810,222 +119318,219 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; GFX9-NEXT: s_cbranch_scc0 .LBB73_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_lshr_b32 s26, s7, 24 -; GFX9-NEXT: v_writelane_b32 v62, s26, 2 -; GFX9-NEXT: s_lshr_b32 s26, s7, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 3 -; GFX9-NEXT: s_lshr_b32 s26, s7, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 4 -; GFX9-NEXT: s_lshr_b32 s26, s6, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 0 -; GFX9-NEXT: s_lshr_b32 s26, s6, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 1 ; GFX9-NEXT: s_lshr_b32 s26, s5, 24 -; GFX9-NEXT: v_writelane_b32 v62, s26, 5 +; GFX9-NEXT: v_writelane_b32 v62, s26, 14 ; GFX9-NEXT: s_lshr_b32 s26, s5, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 6 +; GFX9-NEXT: v_writelane_b32 v62, s26, 15 ; GFX9-NEXT: s_lshr_b32 s26, s5, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 7 -; GFX9-NEXT: s_lshr_b32 s26, s9, 24 -; GFX9-NEXT: v_writelane_b32 v62, s26, 8 -; GFX9-NEXT: s_lshr_b32 s26, s9, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 9 -; GFX9-NEXT: s_lshr_b32 s26, s9, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 10 -; GFX9-NEXT: s_lshr_b32 s26, s11, 24 -; GFX9-NEXT: v_writelane_b32 v62, s26, 11 -; GFX9-NEXT: s_lshr_b32 s26, s11, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 16 +; GFX9-NEXT: s_lshr_b32 s26, s4, 16 ; GFX9-NEXT: v_writelane_b32 v62, s26, 12 -; GFX9-NEXT: s_lshr_b32 s26, s11, 8 +; GFX9-NEXT: s_lshr_b32 s26, s4, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 13 -; GFX9-NEXT: s_lshr_b32 s26, s13, 24 -; GFX9-NEXT: v_writelane_b32 v62, s26, 14 -; GFX9-NEXT: s_lshr_b32 s26, s13, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 15 -; GFX9-NEXT: s_lshr_b32 s26, s13, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 16 -; GFX9-NEXT: s_lshr_b32 s26, s15, 24 +; GFX9-NEXT: s_lshr_b32 s26, s7, 24 ; GFX9-NEXT: v_writelane_b32 v62, s26, 17 -; GFX9-NEXT: s_lshr_b32 s26, s15, 16 +; GFX9-NEXT: s_lshr_b32 s26, s7, 16 ; GFX9-NEXT: v_writelane_b32 v62, s26, 18 -; GFX9-NEXT: s_lshr_b32 s26, s15, 8 +; GFX9-NEXT: s_lshr_b32 s26, s7, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 19 -; GFX9-NEXT: s_lshr_b32 s26, s17, 24 +; GFX9-NEXT: s_lshr_b32 s26, s6, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 10 +; GFX9-NEXT: s_lshr_b32 s26, s6, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 11 +; GFX9-NEXT: s_lshr_b32 s26, s9, 24 ; GFX9-NEXT: v_writelane_b32 v62, s26, 20 -; GFX9-NEXT: s_lshr_b32 s26, s17, 16 +; GFX9-NEXT: s_lshr_b32 s26, s9, 16 ; GFX9-NEXT: v_writelane_b32 v62, s26, 21 -; GFX9-NEXT: s_lshr_b32 s26, s17, 8 +; GFX9-NEXT: s_lshr_b32 s26, s9, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 22 -; GFX9-NEXT: s_lshr_b32 s26, s19, 24 +; GFX9-NEXT: s_lshr_b32 s26, s8, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 8 +; GFX9-NEXT: s_lshr_b32 s26, s8, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 9 +; GFX9-NEXT: s_lshr_b32 s26, s11, 24 ; GFX9-NEXT: v_writelane_b32 v62, s26, 23 -; GFX9-NEXT: s_lshr_b32 s26, s19, 16 +; GFX9-NEXT: s_lshr_b32 s26, s11, 16 ; GFX9-NEXT: v_writelane_b32 v62, s26, 24 -; GFX9-NEXT: s_lshr_b32 s26, s19, 8 +; GFX9-NEXT: s_lshr_b32 s26, s11, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 25 -; GFX9-NEXT: s_lshr_b32 s26, s21, 24 +; GFX9-NEXT: s_lshr_b32 s26, s10, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 6 +; GFX9-NEXT: s_lshr_b32 s26, s10, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 7 +; GFX9-NEXT: s_lshr_b32 s26, s13, 24 ; GFX9-NEXT: v_writelane_b32 v62, s26, 26 -; GFX9-NEXT: s_lshr_b32 s26, s21, 16 +; GFX9-NEXT: s_lshr_b32 s26, s13, 16 ; GFX9-NEXT: v_writelane_b32 v62, s26, 27 -; GFX9-NEXT: s_lshr_b32 s26, s21, 8 +; GFX9-NEXT: s_lshr_b32 s26, s13, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 28 -; GFX9-NEXT: s_lshr_b32 s26, s23, 24 +; GFX9-NEXT: s_lshr_b32 s26, s12, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 4 +; GFX9-NEXT: s_lshr_b32 s26, s12, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 5 +; GFX9-NEXT: s_lshr_b32 s26, s15, 24 ; GFX9-NEXT: v_writelane_b32 v62, s26, 29 -; GFX9-NEXT: s_lshr_b32 s26, s23, 16 +; GFX9-NEXT: s_lshr_b32 s26, s15, 16 ; GFX9-NEXT: v_writelane_b32 v62, s26, 30 -; GFX9-NEXT: s_lshr_b32 s26, s23, 8 +; GFX9-NEXT: s_lshr_b32 s26, s15, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 31 -; GFX9-NEXT: s_lshr_b32 s26, s25, 24 +; GFX9-NEXT: s_lshr_b32 s26, s14, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 2 +; GFX9-NEXT: s_lshr_b32 s26, s14, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 3 +; GFX9-NEXT: s_lshr_b32 s26, s17, 24 ; GFX9-NEXT: v_writelane_b32 v62, s26, 32 -; GFX9-NEXT: s_lshr_b32 s26, s25, 16 +; GFX9-NEXT: s_lshr_b32 s26, s17, 16 ; GFX9-NEXT: v_writelane_b32 v62, s26, 33 -; GFX9-NEXT: s_lshr_b32 s26, s25, 8 +; GFX9-NEXT: s_lshr_b32 s26, s17, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 34 -; GFX9-NEXT: s_lshr_b32 s26, s41, 24 +; GFX9-NEXT: s_lshr_b32 s26, s16, 16 +; GFX9-NEXT: v_writelane_b32 v62, s26, 0 +; GFX9-NEXT: s_lshr_b32 s26, s16, 8 +; GFX9-NEXT: v_writelane_b32 v62, s26, 1 +; GFX9-NEXT: s_lshr_b32 s26, s21, 24 ; GFX9-NEXT: v_writelane_b32 v62, s26, 35 -; GFX9-NEXT: s_lshr_b32 s26, s41, 16 +; GFX9-NEXT: s_lshr_b32 s26, s21, 16 ; GFX9-NEXT: v_writelane_b32 v62, s26, 36 -; GFX9-NEXT: s_lshr_b32 s26, s41, 8 +; GFX9-NEXT: s_lshr_b32 s26, s21, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 37 -; GFX9-NEXT: s_lshr_b32 s26, s43, 24 +; GFX9-NEXT: s_lshr_b32 s26, s19, 24 ; GFX9-NEXT: v_writelane_b32 v62, s26, 38 -; GFX9-NEXT: s_lshr_b32 s26, s43, 16 +; GFX9-NEXT: s_lshr_b32 s26, s19, 16 ; GFX9-NEXT: v_writelane_b32 v62, s26, 39 -; GFX9-NEXT: s_lshr_b32 s26, s43, 8 +; GFX9-NEXT: s_lshr_b32 s26, s19, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 40 -; GFX9-NEXT: s_lshr_b32 s26, s45, 24 +; GFX9-NEXT: s_lshr_b32 s26, s23, 24 ; GFX9-NEXT: v_writelane_b32 v62, s26, 41 -; GFX9-NEXT: s_lshr_b32 s26, s45, 16 +; GFX9-NEXT: s_lshr_b32 s26, s23, 16 ; GFX9-NEXT: v_writelane_b32 v62, s26, 42 -; GFX9-NEXT: s_lshr_b32 s26, s45, 8 +; GFX9-NEXT: s_lshr_b32 s26, s23, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 43 -; GFX9-NEXT: s_lshr_b32 s26, s47, 24 +; GFX9-NEXT: s_lshr_b32 s26, s25, 24 ; GFX9-NEXT: v_writelane_b32 v62, s26, 44 -; GFX9-NEXT: s_lshr_b32 s26, s47, 16 +; GFX9-NEXT: s_lshr_b32 s26, s25, 16 ; GFX9-NEXT: v_writelane_b32 v62, s26, 45 -; GFX9-NEXT: s_lshr_b32 s26, s47, 8 +; GFX9-NEXT: s_lshr_b32 s26, s25, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 46 -; GFX9-NEXT: s_lshr_b32 s26, s57, 24 +; GFX9-NEXT: s_lshr_b32 s26, s41, 24 ; GFX9-NEXT: v_writelane_b32 v62, s26, 47 -; GFX9-NEXT: s_lshr_b32 s26, s57, 16 +; GFX9-NEXT: s_lshr_b32 s26, s41, 16 ; GFX9-NEXT: v_writelane_b32 v62, s26, 48 -; GFX9-NEXT: s_lshr_b32 s26, s57, 8 -; GFX9-NEXT: s_lshr_b32 s83, s4, 16 -; GFX9-NEXT: s_lshr_b32 s82, s4, 8 -; GFX9-NEXT: s_lshr_b32 s85, s8, 16 -; GFX9-NEXT: s_lshr_b32 s84, s8, 8 -; GFX9-NEXT: s_lshr_b32 s87, s10, 16 -; GFX9-NEXT: s_lshr_b32 s86, s10, 8 -; GFX9-NEXT: s_lshr_b32 s97, s12, 16 -; GFX9-NEXT: s_lshr_b32 s96, s12, 8 -; GFX9-NEXT: s_lshr_b32 s99, s14, 16 -; GFX9-NEXT: s_lshr_b32 s98, s14, 8 -; GFX9-NEXT: s_lshr_b32 s39, s16, 16 -; GFX9-NEXT: s_lshr_b32 s38, s16, 8 -; GFX9-NEXT: s_lshr_b32 s49, s18, 16 -; GFX9-NEXT: s_lshr_b32 s48, s18, 8 -; GFX9-NEXT: s_lshr_b32 s51, s20, 16 -; GFX9-NEXT: s_lshr_b32 s50, s20, 8 -; GFX9-NEXT: s_lshr_b32 s53, s22, 16 -; GFX9-NEXT: s_lshr_b32 s52, s22, 8 -; GFX9-NEXT: s_lshr_b32 s55, s24, 16 -; GFX9-NEXT: s_lshr_b32 s54, s24, 8 -; GFX9-NEXT: s_lshr_b32 s65, s40, 16 -; GFX9-NEXT: s_lshr_b32 s64, s40, 8 -; GFX9-NEXT: s_lshr_b32 s67, s42, 16 -; GFX9-NEXT: s_lshr_b32 s66, s42, 8 -; GFX9-NEXT: s_lshr_b32 s69, s44, 16 -; GFX9-NEXT: s_lshr_b32 s68, s44, 8 -; GFX9-NEXT: s_lshr_b32 s71, s46, 16 -; GFX9-NEXT: s_lshr_b32 s70, s46, 8 +; GFX9-NEXT: s_lshr_b32 s26, s41, 8 +; GFX9-NEXT: s_lshr_b32 s39, s20, 16 +; GFX9-NEXT: s_lshr_b32 s38, s20, 8 +; GFX9-NEXT: s_lshr_b32 s83, s18, 16 +; GFX9-NEXT: s_lshr_b32 s82, s18, 8 +; GFX9-NEXT: s_lshr_b32 s84, s22, 16 +; GFX9-NEXT: s_lshr_b32 s48, s22, 8 +; GFX9-NEXT: s_lshr_b32 s86, s24, 16 +; GFX9-NEXT: s_lshr_b32 s85, s24, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 49 +; GFX9-NEXT: s_lshr_b32 s49, s40, 16 +; GFX9-NEXT: s_lshr_b32 s87, s40, 8 +; GFX9-NEXT: s_lshr_b32 s52, s43, 24 +; GFX9-NEXT: s_lshr_b32 s53, s43, 16 +; GFX9-NEXT: s_lshr_b32 s54, s43, 8 +; GFX9-NEXT: s_lshr_b32 s96, s42, 16 +; GFX9-NEXT: s_lshr_b32 s50, s42, 8 +; GFX9-NEXT: s_lshr_b32 s55, s45, 24 +; GFX9-NEXT: s_lshr_b32 s64, s45, 16 +; GFX9-NEXT: s_lshr_b32 s65, s45, 8 +; GFX9-NEXT: s_lshr_b32 s51, s44, 16 +; GFX9-NEXT: s_lshr_b32 s97, s44, 8 +; GFX9-NEXT: s_lshr_b32 s66, s47, 24 +; GFX9-NEXT: s_lshr_b32 s67, s47, 16 +; GFX9-NEXT: s_lshr_b32 s68, s47, 8 +; GFX9-NEXT: s_lshr_b32 s99, s46, 16 +; GFX9-NEXT: s_lshr_b32 s98, s46, 8 +; GFX9-NEXT: s_lshr_b32 s69, s57, 24 +; GFX9-NEXT: s_lshr_b32 s70, s57, 16 +; GFX9-NEXT: s_lshr_b32 s71, s57, 8 ; GFX9-NEXT: s_lshr_b32 s81, s56, 16 ; GFX9-NEXT: s_lshr_b32 s80, s56, 8 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 -; GFX9-NEXT: s_lshr_b64 s[28:29], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[8:9], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[10:11], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[16:17], 24 ; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[28:29], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[56:57], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB73_4 ; GFX9-NEXT: .LBB73_2: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[1:2], s[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[25:26], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[29:30], s[40:41], 1.0 +; GFX9-NEXT: v_add_f64 v[33:34], s[42:43], 1.0 ; GFX9-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 -; GFX9-NEXT: v_add_f64 v[35:36], s[40:41], 1.0 -; GFX9-NEXT: v_add_f64 v[37:38], s[42:43], 1.0 -; GFX9-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[48:49], s[44:45], 1.0 -; GFX9-NEXT: v_add_f64 v[13:14], s[14:15], 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] +; GFX9-NEXT: v_add_f64 v[37:38], s[44:45], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[52:53], s[56:57], 1.0 +; GFX9-NEXT: v_add_f64 v[50:51], s[46:47], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], s[14:15], 1.0 ; GFX9-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 -; GFX9-NEXT: v_add_f64 v[52:53], s[46:47], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], s[16:17], 1.0 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_add_f64 v[39:40], s[56:57], 1.0 -; GFX9-NEXT: v_add_f64 v[33:34], s[24:25], 1.0 -; GFX9-NEXT: v_add_f64 v[31:32], s[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[28:29], s[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], s[18:19], 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[35:36] -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] +; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[37:38] -; GFX9-NEXT: v_readfirstlane_b32 s13, v10 -; GFX9-NEXT: v_lshrrev_b64 v[10:11], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[48:49] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] -; GFX9-NEXT: v_lshrrev_b64 v[44:45], 24, v[52:53] -; GFX9-NEXT: v_readfirstlane_b32 s57, v40 -; GFX9-NEXT: v_readfirstlane_b32 s47, v53 -; GFX9-NEXT: v_readfirstlane_b32 s45, v49 -; GFX9-NEXT: v_readfirstlane_b32 s43, v38 -; GFX9-NEXT: v_readfirstlane_b32 s41, v36 -; GFX9-NEXT: v_readfirstlane_b32 s25, v34 -; GFX9-NEXT: v_readfirstlane_b32 s23, v32 -; GFX9-NEXT: v_readfirstlane_b32 s21, v29 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[29:30] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[44:45], 24, v[33:34] +; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[45:46], 24, v[37:38] +; GFX9-NEXT: v_readfirstlane_b32 s57, v53 +; GFX9-NEXT: v_readfirstlane_b32 s47, v51 +; GFX9-NEXT: v_readfirstlane_b32 s45, v38 +; GFX9-NEXT: v_readfirstlane_b32 s43, v34 +; GFX9-NEXT: v_readfirstlane_b32 s41, v30 +; GFX9-NEXT: v_readfirstlane_b32 s25, v26 +; GFX9-NEXT: v_readfirstlane_b32 s23, v24 ; GFX9-NEXT: v_readfirstlane_b32 s19, v20 -; GFX9-NEXT: v_readfirstlane_b32 s17, v16 -; GFX9-NEXT: v_readfirstlane_b32 s15, v14 +; GFX9-NEXT: v_readfirstlane_b32 s21, v18 +; GFX9-NEXT: v_readfirstlane_b32 s17, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v12 +; GFX9-NEXT: v_readfirstlane_b32 s13, v10 ; GFX9-NEXT: v_readfirstlane_b32 s11, v8 ; GFX9-NEXT: v_readfirstlane_b32 s9, v6 -; GFX9-NEXT: v_readfirstlane_b32 s5, v4 -; GFX9-NEXT: v_readfirstlane_b32 s7, v2 -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b64 v[29:30], 24, v[28:29] -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[31:32] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[33:34] -; GFX9-NEXT: v_lshrrev_b64 v[45:46], 24, v[39:40] -; GFX9-NEXT: s_lshr_b32 s10, s7, 24 -; GFX9-NEXT: s_lshr_b32 s12, s7, 16 -; GFX9-NEXT: s_lshr_b32 s14, s7, 8 +; GFX9-NEXT: v_readfirstlane_b32 s7, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[31:32], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[35:36], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[48:49], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[50:51] +; GFX9-NEXT: v_lshrrev_b64 v[55:56], 24, v[52:53] +; GFX9-NEXT: s_lshr_b32 s10, s5, 24 +; GFX9-NEXT: s_lshr_b32 s12, s5, 16 +; GFX9-NEXT: s_lshr_b32 s14, s5, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: s_lshr_b32 s16, s5, 24 -; GFX9-NEXT: s_lshr_b32 s18, s5, 16 -; GFX9-NEXT: s_lshr_b32 s20, s5, 8 +; GFX9-NEXT: s_lshr_b32 s16, s7, 24 +; GFX9-NEXT: s_lshr_b32 s18, s7, 16 +; GFX9-NEXT: s_lshr_b32 s20, s7, 8 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX9-NEXT: s_lshr_b32 s22, s9, 24 @@ -120046,537 +119551,567 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: s_lshr_b32 s46, s15, 24 ; GFX9-NEXT: s_lshr_b32 s56, s15, 16 ; GFX9-NEXT: s_lshr_b32 s58, s15, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_lshr_b32 s59, s17, 24 ; GFX9-NEXT: s_lshr_b32 s60, s17, 16 ; GFX9-NEXT: s_lshr_b32 s61, s17, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v15 -; GFX9-NEXT: s_lshr_b32 s62, s19, 24 -; GFX9-NEXT: s_lshr_b32 s63, s19, 16 -; GFX9-NEXT: s_lshr_b32 s72, s19, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v19 -; GFX9-NEXT: s_lshr_b32 s73, s21, 24 -; GFX9-NEXT: s_lshr_b32 s74, s21, 16 -; GFX9-NEXT: s_lshr_b32 s75, s21, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v13 +; GFX9-NEXT: s_lshr_b32 s62, s21, 24 +; GFX9-NEXT: s_lshr_b32 s63, s21, 16 +; GFX9-NEXT: s_lshr_b32 s72, s21, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v17 +; GFX9-NEXT: s_lshr_b32 s73, s19, 24 +; GFX9-NEXT: s_lshr_b32 s74, s19, 16 +; GFX9-NEXT: s_lshr_b32 s75, s19, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v19 ; GFX9-NEXT: s_lshr_b32 s76, s23, 24 ; GFX9-NEXT: s_lshr_b32 s77, s23, 16 ; GFX9-NEXT: s_lshr_b32 s78, s23, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v23 ; GFX9-NEXT: s_lshr_b32 s79, s25, 24 ; GFX9-NEXT: s_lshr_b32 s88, s25, 16 ; GFX9-NEXT: s_lshr_b32 s89, s25, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v25 ; GFX9-NEXT: s_lshr_b32 s90, s41, 24 ; GFX9-NEXT: s_lshr_b32 s91, s41, 16 ; GFX9-NEXT: s_lshr_b32 s92, s41, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v35 -; GFX9-NEXT: s_lshr_b32 s93, s43, 24 -; GFX9-NEXT: s_lshr_b32 s94, s43, 16 -; GFX9-NEXT: s_lshr_b32 s95, s43, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v37 -; GFX9-NEXT: s_lshr_b32 vcc_lo, s45, 24 -; GFX9-NEXT: s_lshr_b32 vcc_hi, s45, 16 -; GFX9-NEXT: s_lshr_b32 s30, s45, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v48 -; GFX9-NEXT: s_lshr_b32 s31, s47, 24 -; GFX9-NEXT: s_lshr_b32 s34, s47, 16 -; GFX9-NEXT: s_lshr_b32 s35, s47, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v52 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v52 -; GFX9-NEXT: s_lshr_b32 s8, s57, 24 -; GFX9-NEXT: s_lshr_b32 s36, s57, 16 -; GFX9-NEXT: s_lshr_b32 s6, s57, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v29 +; GFX9-NEXT: s_lshr_b32 s52, s43, 24 +; GFX9-NEXT: s_lshr_b32 s53, s43, 16 +; GFX9-NEXT: s_lshr_b32 s54, s43, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v33 +; GFX9-NEXT: s_lshr_b32 s55, s45, 24 +; GFX9-NEXT: s_lshr_b32 s64, s45, 16 +; GFX9-NEXT: s_lshr_b32 s65, s45, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v37 +; GFX9-NEXT: s_lshr_b32 s66, s47, 24 +; GFX9-NEXT: s_lshr_b32 s67, s47, 16 +; GFX9-NEXT: s_lshr_b32 s68, s47, 8 +; GFX9-NEXT: s_lshr_b32 s69, s57, 24 +; GFX9-NEXT: s_lshr_b32 s70, s57, 16 +; GFX9-NEXT: s_lshr_b32 s71, s57, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v52 ; GFX9-NEXT: s_branch .LBB73_5 ; GFX9-NEXT: .LBB73_3: -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 ; GFX9-NEXT: ; implicit-def: $sgpr80 ; GFX9-NEXT: ; implicit-def: $sgpr81 -; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr71 -; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr70 ; GFX9-NEXT: ; implicit-def: $sgpr69 -; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr98 +; GFX9-NEXT: ; implicit-def: $sgpr99 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr68 ; GFX9-NEXT: ; implicit-def: $sgpr67 -; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr51 +; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr65 -; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr64 ; GFX9-NEXT: ; implicit-def: $sgpr55 -; GFX9-NEXT: ; implicit-def: $sgpr52 -; GFX9-NEXT: ; implicit-def: $sgpr53 ; GFX9-NEXT: ; implicit-def: $sgpr50 -; GFX9-NEXT: ; implicit-def: $sgpr51 -; GFX9-NEXT: ; implicit-def: $sgpr48 -; GFX9-NEXT: ; implicit-def: $sgpr49 -; GFX9-NEXT: ; implicit-def: $sgpr38 -; GFX9-NEXT: ; implicit-def: $sgpr39 -; GFX9-NEXT: ; implicit-def: $sgpr98 -; GFX9-NEXT: ; implicit-def: $sgpr99 ; GFX9-NEXT: ; implicit-def: $sgpr96 -; GFX9-NEXT: ; implicit-def: $sgpr97 -; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr87 -; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr62 ; GFX9-NEXT: ; implicit-def: $sgpr85 +; GFX9-NEXT: ; implicit-def: $sgpr86 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr74 ; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: ; implicit-def: $sgpr83 -; GFX9-NEXT: ; implicit-def: $sgpr36 -; GFX9-NEXT: ; implicit-def: $sgpr34 -; GFX9-NEXT: ; implicit-def: $sgpr30 -; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr92 -; GFX9-NEXT: ; implicit-def: $sgpr90 -; GFX9-NEXT: ; implicit-def: $sgpr88 -; GFX9-NEXT: ; implicit-def: $sgpr78 ; GFX9-NEXT: ; implicit-def: $sgpr76 -; GFX9-NEXT: ; implicit-def: $sgpr74 -; GFX9-NEXT: ; implicit-def: $sgpr72 -; GFX9-NEXT: ; implicit-def: $sgpr62 -; GFX9-NEXT: ; implicit-def: $sgpr60 -; GFX9-NEXT: ; implicit-def: $sgpr58 -; GFX9-NEXT: ; implicit-def: $sgpr28 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr39 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr90 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr27 +; GFX9-NEXT: ; kill: killed $sgpr27 ; GFX9-NEXT: s_branch .LBB73_2 ; GFX9-NEXT: .LBB73_4: -; GFX9-NEXT: v_mov_b32_e32 v22, s62 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v23, s60 -; GFX9-NEXT: v_mov_b32_e32 v22, s58 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v22, s28 -; GFX9-NEXT: v_readlane_b32 s27, v62, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s27 -; GFX9-NEXT: v_readlane_b32 s27, v62, 1 -; GFX9-NEXT: v_mov_b32_e32 v60, s81 -; GFX9-NEXT: v_mov_b32_e32 v21, s80 -; GFX9-NEXT: v_mov_b32_e32 v57, s71 -; GFX9-NEXT: v_mov_b32_e32 v17, s70 -; GFX9-NEXT: v_mov_b32_e32 v47, s69 -; GFX9-NEXT: v_mov_b32_e32 v11, s68 -; GFX9-NEXT: v_mov_b32_e32 v46, s67 -; GFX9-NEXT: v_mov_b32_e32 v61, s66 -; GFX9-NEXT: v_mov_b32_e32 v40, s65 -; GFX9-NEXT: v_mov_b32_e32 v59, s64 -; GFX9-NEXT: v_mov_b32_e32 v55, s55 -; GFX9-NEXT: v_mov_b32_e32 v58, s54 -; GFX9-NEXT: v_mov_b32_e32 v53, s53 -; GFX9-NEXT: v_mov_b32_e32 v56, s52 -; GFX9-NEXT: v_mov_b32_e32 v49, s51 -; GFX9-NEXT: v_mov_b32_e32 v51, s50 -; GFX9-NEXT: v_mov_b32_e32 v36, s49 -; GFX9-NEXT: v_mov_b32_e32 v38, s48 -; GFX9-NEXT: v_mov_b32_e32 v32, s39 -; GFX9-NEXT: v_mov_b32_e32 v34, s38 -; GFX9-NEXT: v_mov_b32_e32 v27, s99 -; GFX9-NEXT: v_mov_b32_e32 v30, s98 -; GFX9-NEXT: v_mov_b32_e32 v18, s97 -; GFX9-NEXT: v_mov_b32_e32 v20, s96 -; GFX9-NEXT: v_mov_b32_e32 v14, s87 -; GFX9-NEXT: v_mov_b32_e32 v16, s86 -; GFX9-NEXT: v_mov_b32_e32 v10, s85 -; GFX9-NEXT: v_mov_b32_e32 v12, s84 -; GFX9-NEXT: v_mov_b32_e32 v6, s83 -; GFX9-NEXT: v_mov_b32_e32 v8, s82 -; GFX9-NEXT: v_mov_b32_e32 v4, s27 -; GFX9-NEXT: v_mov_b32_e32 v39, s56 -; GFX9-NEXT: v_mov_b32_e32 v52, s46 -; GFX9-NEXT: v_mov_b32_e32 v48, s44 -; GFX9-NEXT: v_mov_b32_e32 v37, s42 -; GFX9-NEXT: v_mov_b32_e32 v35, s40 -; GFX9-NEXT: v_mov_b32_e32 v33, s24 -; GFX9-NEXT: v_mov_b32_e32 v31, s22 -; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s30 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v2, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 0 +; GFX9-NEXT: v_mov_b32_e32 v26, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 1 +; GFX9-NEXT: v_mov_b32_e32 v28, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 2 +; GFX9-NEXT: v_mov_b32_e32 v22, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 3 +; GFX9-NEXT: v_mov_b32_e32 v24, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 4 +; GFX9-NEXT: v_mov_b32_e32 v18, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 5 +; GFX9-NEXT: v_mov_b32_e32 v20, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 6 +; GFX9-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 7 +; GFX9-NEXT: v_mov_b32_e32 v16, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 8 +; GFX9-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 10 +; GFX9-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 11 +; GFX9-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 12 +; GFX9-NEXT: v_mov_b32_e32 v52, s56 +; GFX9-NEXT: v_mov_b32_e32 v50, s46 +; GFX9-NEXT: v_mov_b32_e32 v37, s44 +; GFX9-NEXT: v_mov_b32_e32 v33, s42 +; GFX9-NEXT: v_mov_b32_e32 v29, s40 +; GFX9-NEXT: v_mov_b32_e32 v25, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s22 ; GFX9-NEXT: v_mov_b32_e32 v19, s18 -; GFX9-NEXT: v_mov_b32_e32 v15, s16 -; GFX9-NEXT: v_mov_b32_e32 v13, s14 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s14 ; GFX9-NEXT: v_mov_b32_e32 v9, s12 ; GFX9-NEXT: v_mov_b32_e32 v7, s10 ; GFX9-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v45, s36 -; GFX9-NEXT: v_mov_b32_e32 v44, s34 -; GFX9-NEXT: v_mov_b32_e32 v43, s30 -; GFX9-NEXT: v_mov_b32_e32 v42, s94 -; GFX9-NEXT: v_mov_b32_e32 v41, s92 -; GFX9-NEXT: v_mov_b32_e32 v54, s90 -; GFX9-NEXT: v_mov_b32_e32 v50, s88 -; GFX9-NEXT: v_mov_b32_e32 v29, s78 -; GFX9-NEXT: v_mov_b32_e32 v26, s76 -; GFX9-NEXT: v_mov_b32_e32 v25, s74 -; GFX9-NEXT: v_mov_b32_e32 v24, s72 -; GFX9-NEXT: v_readlane_b32 s10, v62, 2 -; GFX9-NEXT: v_readlane_b32 s12, v62, 3 -; GFX9-NEXT: v_readlane_b32 s14, v62, 4 -; GFX9-NEXT: v_readlane_b32 s16, v62, 5 -; GFX9-NEXT: v_readlane_b32 s18, v62, 6 -; GFX9-NEXT: v_readlane_b32 s20, v62, 7 -; GFX9-NEXT: v_readlane_b32 s22, v62, 8 -; GFX9-NEXT: v_readlane_b32 s24, v62, 9 -; GFX9-NEXT: v_readlane_b32 s27, v62, 11 -; GFX9-NEXT: v_readlane_b32 s28, v62, 12 -; GFX9-NEXT: v_readlane_b32 s29, v62, 13 -; GFX9-NEXT: v_readlane_b32 s40, v62, 14 -; GFX9-NEXT: v_readlane_b32 s42, v62, 15 -; GFX9-NEXT: v_readlane_b32 s44, v62, 16 -; GFX9-NEXT: v_readlane_b32 s46, v62, 17 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v22, s26 -; GFX9-NEXT: v_readlane_b32 s26, v62, 10 -; GFX9-NEXT: v_readlane_b32 s56, v62, 18 -; GFX9-NEXT: v_readlane_b32 s58, v62, 19 -; GFX9-NEXT: v_readlane_b32 s59, v62, 20 -; GFX9-NEXT: v_readlane_b32 s60, v62, 21 -; GFX9-NEXT: v_readlane_b32 s61, v62, 22 -; GFX9-NEXT: v_readlane_b32 s62, v62, 23 -; GFX9-NEXT: v_readlane_b32 s63, v62, 24 -; GFX9-NEXT: v_readlane_b32 s72, v62, 25 -; GFX9-NEXT: v_readlane_b32 s73, v62, 26 -; GFX9-NEXT: v_readlane_b32 s74, v62, 27 -; GFX9-NEXT: v_readlane_b32 s75, v62, 28 -; GFX9-NEXT: v_readlane_b32 s76, v62, 29 -; GFX9-NEXT: v_readlane_b32 s77, v62, 30 -; GFX9-NEXT: v_readlane_b32 s78, v62, 31 -; GFX9-NEXT: v_readlane_b32 s79, v62, 32 -; GFX9-NEXT: v_readlane_b32 s88, v62, 33 -; GFX9-NEXT: v_readlane_b32 s89, v62, 34 -; GFX9-NEXT: v_readlane_b32 s90, v62, 35 -; GFX9-NEXT: v_readlane_b32 s91, v62, 36 -; GFX9-NEXT: v_readlane_b32 s92, v62, 37 -; GFX9-NEXT: v_readlane_b32 s93, v62, 38 -; GFX9-NEXT: v_readlane_b32 s94, v62, 39 -; GFX9-NEXT: v_readlane_b32 s95, v62, 40 -; GFX9-NEXT: v_readlane_b32 vcc_lo, v62, 41 -; GFX9-NEXT: v_readlane_b32 vcc_hi, v62, 42 -; GFX9-NEXT: v_readlane_b32 s30, v62, 43 -; GFX9-NEXT: v_readlane_b32 s31, v62, 44 -; GFX9-NEXT: v_readlane_b32 s34, v62, 45 -; GFX9-NEXT: v_readlane_b32 s35, v62, 46 -; GFX9-NEXT: v_readlane_b32 s8, v62, 47 -; GFX9-NEXT: v_readlane_b32 s36, v62, 48 -; GFX9-NEXT: v_readlane_b32 s6, v62, 49 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v55, s26 +; GFX9-NEXT: v_mov_b32_e32 v21, s81 +; GFX9-NEXT: v_mov_b32_e32 v27, s80 +; GFX9-NEXT: v_mov_b32_e32 v46, s28 +; GFX9-NEXT: v_mov_b32_e32 v61, s99 +; GFX9-NEXT: v_mov_b32_e32 v15, s98 +; GFX9-NEXT: v_mov_b32_e32 v45, s58 +; GFX9-NEXT: v_mov_b32_e32 v59, s51 +; GFX9-NEXT: v_mov_b32_e32 v60, s97 +; GFX9-NEXT: v_mov_b32_e32 v44, s60 +; GFX9-NEXT: v_mov_b32_e32 v57, s96 +; GFX9-NEXT: v_mov_b32_e32 v58, s50 +; GFX9-NEXT: v_mov_b32_e32 v43, s62 +; GFX9-NEXT: v_mov_b32_e32 v47, s49 +; GFX9-NEXT: v_mov_b32_e32 v56, s87 +; GFX9-NEXT: v_mov_b32_e32 v42, s72 +; GFX9-NEXT: v_mov_b32_e32 v51, s86 +; GFX9-NEXT: v_mov_b32_e32 v53, s85 +; GFX9-NEXT: v_mov_b32_e32 v41, s74 +; GFX9-NEXT: v_mov_b32_e32 v38, s84 +; GFX9-NEXT: v_mov_b32_e32 v49, s48 +; GFX9-NEXT: v_mov_b32_e32 v40, s76 +; GFX9-NEXT: v_mov_b32_e32 v34, s83 +; GFX9-NEXT: v_mov_b32_e32 v36, s82 +; GFX9-NEXT: v_mov_b32_e32 v39, s78 +; GFX9-NEXT: v_mov_b32_e32 v30, s39 +; GFX9-NEXT: v_mov_b32_e32 v32, s38 +; GFX9-NEXT: v_mov_b32_e32 v54, s88 +; GFX9-NEXT: v_mov_b32_e32 v48, s90 +; GFX9-NEXT: v_mov_b32_e32 v35, s92 +; GFX9-NEXT: v_mov_b32_e32 v31, s94 +; GFX9-NEXT: v_readlane_b32 s10, v62, 14 +; GFX9-NEXT: v_readlane_b32 s12, v62, 15 +; GFX9-NEXT: v_readlane_b32 s14, v62, 16 +; GFX9-NEXT: v_readlane_b32 s16, v62, 17 +; GFX9-NEXT: v_readlane_b32 s18, v62, 18 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_readlane_b32 s20, v62, 19 +; GFX9-NEXT: v_readlane_b32 s22, v62, 20 +; GFX9-NEXT: v_readlane_b32 s24, v62, 21 +; GFX9-NEXT: v_readlane_b32 s26, v62, 22 +; GFX9-NEXT: v_readlane_b32 s27, v62, 23 +; GFX9-NEXT: v_readlane_b32 s28, v62, 24 +; GFX9-NEXT: v_readlane_b32 s29, v62, 25 +; GFX9-NEXT: v_readlane_b32 s40, v62, 26 +; GFX9-NEXT: v_readlane_b32 s42, v62, 27 +; GFX9-NEXT: v_readlane_b32 s44, v62, 28 +; GFX9-NEXT: v_readlane_b32 s46, v62, 29 +; GFX9-NEXT: v_readlane_b32 s56, v62, 30 +; GFX9-NEXT: v_readlane_b32 s58, v62, 31 +; GFX9-NEXT: v_readlane_b32 s59, v62, 32 +; GFX9-NEXT: v_readlane_b32 s60, v62, 33 +; GFX9-NEXT: v_readlane_b32 s61, v62, 34 +; GFX9-NEXT: v_readlane_b32 s62, v62, 35 +; GFX9-NEXT: v_readlane_b32 s63, v62, 36 +; GFX9-NEXT: v_readlane_b32 s72, v62, 37 +; GFX9-NEXT: v_readlane_b32 s73, v62, 38 +; GFX9-NEXT: v_readlane_b32 s74, v62, 39 +; GFX9-NEXT: v_readlane_b32 s75, v62, 40 +; GFX9-NEXT: v_readlane_b32 s76, v62, 41 +; GFX9-NEXT: v_readlane_b32 s77, v62, 42 +; GFX9-NEXT: v_readlane_b32 s78, v62, 43 +; GFX9-NEXT: v_readlane_b32 s79, v62, 44 +; GFX9-NEXT: v_readlane_b32 s88, v62, 45 +; GFX9-NEXT: v_readlane_b32 s89, v62, 46 +; GFX9-NEXT: v_readlane_b32 s90, v62, 47 +; GFX9-NEXT: v_readlane_b32 s91, v62, 48 +; GFX9-NEXT: v_readlane_b32 s92, v62, 49 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 13 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 ; GFX9-NEXT: .LBB73_5: ; %end -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v37, v37, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v35, v35, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v58 ; GFX9-NEXT: s_and_b32 s4, s57, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: v_or_b32_sdwa v33, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v56 +; GFX9-NEXT: s_lshl_b32 s6, s71, 8 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s6, s36, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v45 +; GFX9-NEXT: s_and_b32 s6, s70, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s69, 8 +; GFX9-NEXT: v_or_b32_sdwa v27, v52, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v52, 8, v55 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v39, v60, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v27, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen ; GFX9-NEXT: v_mov_b32_e32 v21, s4 ; GFX9-NEXT: s_and_b32 s4, s47, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s35, 8 +; GFX9-NEXT: s_lshl_b32 s6, s68, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s6, s34, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s31, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: s_and_b32 s6, s67, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s66, 8 ; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v44 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v46 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v21, v57, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v50, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v61, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: s_and_b32 s4, s45, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s30, 8 +; GFX9-NEXT: s_lshl_b32 s6, s65, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s6, vcc_hi, 0xff -; GFX9-NEXT: s_lshl_b32 s8, vcc_lo, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v43 +; GFX9-NEXT: s_and_b32 s6, s64, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s55, 8 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v60 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v45 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v11, v48, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v47, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v37, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v59, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: s_and_b32 s4, s43, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s95, 8 +; GFX9-NEXT: s_lshl_b32 s6, s54, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s6, s94, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s93, 8 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v42 +; GFX9-NEXT: s_and_b32 s6, s53, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s52, 8 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v58 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v44 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v11, v46, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v57, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v11, v37, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: s_and_b32 s4, s41, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s92, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s91, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s90, 8 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v43 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v47, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v11, v35, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: s_and_b32 s4, s25, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s89, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s88, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s79, 8 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v54 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v53 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v42 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v11, v55, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v51, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: s_and_b32 s4, s23, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s78, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s77, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s76, 8 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v50 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v49 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v41 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v11, v53, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v11, v31, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-NEXT: s_and_b32 s4, s21, 0xff +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: s_and_b32 s4, s19, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s75, 8 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v36 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s74, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s73, 8 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v51 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v40 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v49, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-NEXT: s_and_b32 s4, s19, 0xff +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: s_and_b32 s4, s21, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s72, 8 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v32 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s63, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s62, 8 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v38 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v15, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v39 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v11, v19, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v36, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v30, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: s_and_b32 s4, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s61, 8 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v34 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v28 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s60, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s59, 8 -; GFX9-NEXT: v_or_b32_sdwa v11, v15, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v54 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v15, v32, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: v_mov_b32_e32 v13, s4 ; GFX9-NEXT: s_and_b32 s4, s15, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s58, 8 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v30 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s56, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s46, 8 -; GFX9-NEXT: v_or_b32_sdwa v11, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v48 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v13, v27, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v22, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: v_mov_b32_e32 v11, s4 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v20 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s4, s13, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s44, 8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v20 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s42, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s40, 8 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v35 ; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: s_and_b32 s4, s11, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s29, 8 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v31 +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: v_or_b32_sdwa v9, v14, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v12 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s9, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s26, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s22, 8 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_lshl_b32 s6, s20, 8 ; GFX9-NEXT: v_readlane_b32 s99, v63, 35 ; GFX9-NEXT: v_readlane_b32 s98, v63, 34 ; GFX9-NEXT: v_readlane_b32 s97, v63, 33 @@ -120614,61 +120149,24 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v19 -; GFX9-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: v_mov_b32_e32 v9, s4 -; GFX9-NEXT: s_and_b32 s4, s11, 0xff -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v8 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s6, s28, 0xff -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v23 -; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v9, v14, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s16, 8 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: v_mov_b32_e32 v7, s4 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v12 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: s_and_b32 s4, s9, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s26, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s22, 8 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: s_lshl_b32 s6, s16, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v11 -; GFX9-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v8 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: s_and_b32 s4, s5, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s20, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s18, 0xff -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_lshl_b32 s5, s14, 8 ; GFX9-NEXT: s_lshl_b32 s6, s10, 8 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 @@ -120681,7 +120179,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: s_and_b32 s4, s7, 0xff +; GFX9-NEXT: s_and_b32 s4, s5, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s14, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s12, 0xff ; GFX9-NEXT: s_or_b32 s5, s5, s6 @@ -120710,8 +120209,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -120816,236 +120315,236 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: s_cbranch_scc0 .LBB73_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s42, s1, 24 -; GFX11-NEXT: s_lshr_b32 s53, s2, 16 -; GFX11-NEXT: v_writelane_b32 v42, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s1, 16 -; GFX11-NEXT: s_lshr_b32 s52, s2, 8 ; GFX11-NEXT: s_lshr_b32 s55, s4, 16 -; GFX11-NEXT: s_lshr_b32 s54, s4, 8 -; GFX11-NEXT: v_writelane_b32 v42, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s1, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 5 +; GFX11-NEXT: s_lshr_b32 s42, s1, 16 ; GFX11-NEXT: s_lshr_b32 s65, s6, 16 ; GFX11-NEXT: s_lshr_b32 s64, s6, 8 ; GFX11-NEXT: s_lshr_b32 s67, s8, 16 -; GFX11-NEXT: v_writelane_b32 v42, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s0, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 6 +; GFX11-NEXT: s_lshr_b32 s42, s1, 8 ; GFX11-NEXT: s_lshr_b32 s66, s8, 8 ; GFX11-NEXT: s_lshr_b32 s69, s10, 16 ; GFX11-NEXT: s_lshr_b32 s68, s10, 8 -; GFX11-NEXT: v_writelane_b32 v42, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s0, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 7 +; GFX11-NEXT: s_lshr_b32 s42, s0, 16 ; GFX11-NEXT: s_lshr_b32 s71, s12, 16 ; GFX11-NEXT: s_lshr_b32 s70, s12, 8 ; GFX11-NEXT: s_lshr_b32 s81, s14, 16 -; GFX11-NEXT: v_writelane_b32 v42, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s3, 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 3 +; GFX11-NEXT: s_lshr_b32 s42, s0, 8 ; GFX11-NEXT: s_lshr_b32 s80, s14, 8 ; GFX11-NEXT: s_lshr_b32 s83, s16, 16 ; GFX11-NEXT: s_lshr_b32 s82, s16, 8 -; GFX11-NEXT: v_writelane_b32 v42, s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s3, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s3, 24 ; GFX11-NEXT: s_lshr_b32 s85, s18, 16 ; GFX11-NEXT: s_lshr_b32 s84, s18, 8 ; GFX11-NEXT: s_lshr_b32 s87, s20, 16 -; GFX11-NEXT: v_writelane_b32 v42, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 8 +; GFX11-NEXT: s_lshr_b32 s42, s3, 16 ; GFX11-NEXT: s_lshr_b32 s86, s20, 8 ; GFX11-NEXT: s_lshr_b32 s97, s22, 16 ; GFX11-NEXT: s_lshr_b32 s96, s22, 8 -; GFX11-NEXT: v_writelane_b32 v42, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s5, 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 9 +; GFX11-NEXT: s_lshr_b32 s42, s3, 8 +; GFX11-NEXT: s_lshr_b32 s101, s25, 24 +; GFX11-NEXT: s_lshr_b32 s102, s25, 16 +; GFX11-NEXT: s_lshr_b32 s103, s25, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 10 +; GFX11-NEXT: s_lshr_b32 s42, s2, 16 ; GFX11-NEXT: s_lshr_b32 s99, s24, 16 ; GFX11-NEXT: s_lshr_b32 s98, s24, 8 ; GFX11-NEXT: s_lshr_b32 s104, s27, 24 -; GFX11-NEXT: v_writelane_b32 v42, s42, 8 -; GFX11-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 1 +; GFX11-NEXT: s_lshr_b32 s42, s2, 8 ; GFX11-NEXT: s_lshr_b32 vcc_hi, s27, 16 ; GFX11-NEXT: s_lshr_b32 s34, s27, 8 -; GFX11-NEXT: s_lshr_b32 s101, s26, 16 -; GFX11-NEXT: v_writelane_b32 v42, s42, 9 -; GFX11-NEXT: s_lshr_b32 s42, s5, 8 +; GFX11-NEXT: s_lshr_b32 s50, s26, 16 +; GFX11-NEXT: v_writelane_b32 v42, s42, 2 +; GFX11-NEXT: s_lshr_b32 s42, s5, 24 ; GFX11-NEXT: s_lshr_b32 s100, s26, 8 ; GFX11-NEXT: s_lshr_b32 s35, s29, 24 ; GFX11-NEXT: s_lshr_b32 s36, s29, 16 -; GFX11-NEXT: v_writelane_b32 v42, s42, 10 -; GFX11-NEXT: s_lshr_b32 s42, s7, 24 -; GFX11-NEXT: s_lshr_b32 s37, s29, 8 -; GFX11-NEXT: s_lshr_b32 s50, s28, 16 -; GFX11-NEXT: s_lshr_b32 s102, s28, 8 ; GFX11-NEXT: v_writelane_b32 v42, s42, 11 -; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b32 s42, s5, 16 +; GFX11-NEXT: s_lshr_b32 s37, s29, 8 +; GFX11-NEXT: s_lshr_b32 s52, s28, 16 +; GFX11-NEXT: s_lshr_b32 s51, s28, 8 +; GFX11-NEXT: v_writelane_b32 v42, s42, 12 +; GFX11-NEXT: s_lshr_b32 s42, s5, 8 ; GFX11-NEXT: s_lshr_b32 s38, s41, 24 ; GFX11-NEXT: s_lshr_b32 s39, s41, 16 ; GFX11-NEXT: s_lshr_b32 s48, s41, 8 -; GFX11-NEXT: v_writelane_b32 v42, s42, 12 -; GFX11-NEXT: s_lshr_b32 s42, s7, 8 -; GFX11-NEXT: s_lshr_b32 s103, s40, 16 -; GFX11-NEXT: s_lshr_b32 s51, s40, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[0:1], 24 ; GFX11-NEXT: v_writelane_b32 v42, s42, 13 -; GFX11-NEXT: s_lshr_b32 s42, s9, 24 -; GFX11-NEXT: s_lshr_b64 s[72:73], s[2:3], 24 -; GFX11-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 +; GFX11-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-NEXT: s_lshr_b32 s54, s40, 16 +; GFX11-NEXT: s_lshr_b32 s53, s40, 8 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[0:1], 24 +; GFX11-NEXT: v_writelane_b32 v42, s42, 0 +; GFX11-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[2:3], 24 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[4:5], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[6:7], 24 ; GFX11-NEXT: v_writelane_b32 v42, s42, 14 -; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 +; GFX11-NEXT: s_lshr_b32 s42, s7, 16 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[12:13], 24 ; GFX11-NEXT: v_writelane_b32 v42, s42, 15 -; GFX11-NEXT: s_lshr_b32 s42, s9, 8 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[16:17], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[18:19], 24 +; GFX11-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[14:15], 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[16:17], 24 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[18:19], 24 ; GFX11-NEXT: v_writelane_b32 v42, s42, 16 -; GFX11-NEXT: s_lshr_b32 s42, s11, 24 +; GFX11-NEXT: s_lshr_b32 s42, s9, 24 ; GFX11-NEXT: s_lshr_b64 s[60:61], s[20:21], 24 ; GFX11-NEXT: s_lshr_b64 s[58:59], s[22:23], 24 ; GFX11-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 ; GFX11-NEXT: v_writelane_b32 v42, s42, 17 -; GFX11-NEXT: s_lshr_b32 s42, s11, 16 +; GFX11-NEXT: s_lshr_b32 s42, s9, 16 ; GFX11-NEXT: s_lshr_b64 s[46:47], s[26:27], 24 ; GFX11-NEXT: s_lshr_b64 s[44:45], s[28:29], 24 ; GFX11-NEXT: v_writelane_b32 v42, s42, 18 -; GFX11-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-NEXT: s_lshr_b32 s42, s9, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v42, s42, 19 -; GFX11-NEXT: s_lshr_b32 s42, s13, 24 +; GFX11-NEXT: s_lshr_b32 s42, s11, 24 ; GFX11-NEXT: v_writelane_b32 v42, s42, 20 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 +; GFX11-NEXT: s_lshr_b32 s42, s11, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v42, s42, 21 -; GFX11-NEXT: s_lshr_b32 s42, s13, 8 +; GFX11-NEXT: s_lshr_b32 s42, s11, 8 ; GFX11-NEXT: v_writelane_b32 v42, s42, 22 -; GFX11-NEXT: s_lshr_b32 s42, s15, 24 +; GFX11-NEXT: s_lshr_b32 s42, s13, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v42, s42, 23 -; GFX11-NEXT: s_lshr_b32 s42, s15, 16 +; GFX11-NEXT: s_lshr_b32 s42, s13, 16 ; GFX11-NEXT: v_writelane_b32 v42, s42, 24 -; GFX11-NEXT: s_lshr_b32 s42, s15, 8 +; GFX11-NEXT: s_lshr_b32 s42, s13, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v42, s42, 25 -; GFX11-NEXT: s_lshr_b32 s42, s17, 24 +; GFX11-NEXT: s_lshr_b32 s42, s15, 24 ; GFX11-NEXT: v_writelane_b32 v42, s42, 26 -; GFX11-NEXT: s_lshr_b32 s42, s17, 16 +; GFX11-NEXT: s_lshr_b32 s42, s15, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v42, s42, 27 -; GFX11-NEXT: s_lshr_b32 s42, s17, 8 +; GFX11-NEXT: s_lshr_b32 s42, s15, 8 ; GFX11-NEXT: v_writelane_b32 v42, s42, 28 -; GFX11-NEXT: s_lshr_b32 s42, s19, 24 +; GFX11-NEXT: s_lshr_b32 s42, s17, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v42, s42, 29 -; GFX11-NEXT: s_lshr_b32 s42, s19, 16 +; GFX11-NEXT: s_lshr_b32 s42, s17, 16 ; GFX11-NEXT: v_writelane_b32 v42, s42, 30 -; GFX11-NEXT: s_lshr_b32 s42, s19, 8 +; GFX11-NEXT: s_lshr_b32 s42, s17, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v42, s42, 31 -; GFX11-NEXT: s_lshr_b32 s42, s21, 24 +; GFX11-NEXT: s_lshr_b32 s42, s19, 24 ; GFX11-NEXT: v_writelane_b32 v43, s42, 0 -; GFX11-NEXT: s_lshr_b32 s42, s21, 16 +; GFX11-NEXT: s_lshr_b32 s42, s19, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v43, s42, 1 -; GFX11-NEXT: s_lshr_b32 s42, s21, 8 +; GFX11-NEXT: s_lshr_b32 s42, s19, 8 ; GFX11-NEXT: v_writelane_b32 v43, s42, 2 -; GFX11-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-NEXT: s_lshr_b32 s42, s21, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v43, s42, 3 -; GFX11-NEXT: s_lshr_b32 s42, s23, 16 +; GFX11-NEXT: s_lshr_b32 s42, s21, 16 ; GFX11-NEXT: v_writelane_b32 v43, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-NEXT: s_lshr_b32 s42, s21, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v43, s42, 5 -; GFX11-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-NEXT: s_lshr_b32 s42, s23, 24 ; GFX11-NEXT: v_writelane_b32 v43, s42, 6 -; GFX11-NEXT: s_lshr_b32 s42, s25, 16 +; GFX11-NEXT: s_lshr_b32 s42, s23, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_writelane_b32 v43, s42, 7 -; GFX11-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-NEXT: s_lshr_b32 s42, s23, 8 ; GFX11-NEXT: v_writelane_b32 v43, s42, 8 ; GFX11-NEXT: s_lshr_b64 s[42:43], s[40:41], 24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s49 ; GFX11-NEXT: s_cbranch_vccnz .LBB73_4 ; GFX11-NEXT: .LBB73_2: ; %cmp.true -; GFX11-NEXT: v_add_f64 v[50:51], s[24:25], 1.0 ; GFX11-NEXT: v_add_f64 v[52:53], s[26:27], 1.0 -; GFX11-NEXT: v_add_f64 v[68:69], s[40:41], 1.0 +; GFX11-NEXT: v_add_f64 v[32:33], s[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[66:67], s[40:41], 1.0 ; GFX11-NEXT: v_add_f64 v[64:65], s[28:29], 1.0 -; GFX11-NEXT: v_add_f64 v[48:49], s[22:23], 1.0 -; GFX11-NEXT: v_add_f64 v[35:36], s[20:21], 1.0 -; GFX11-NEXT: v_add_f64 v[31:32], s[18:19], 1.0 -; GFX11-NEXT: v_add_f64 v[29:30], s[16:17], 1.0 -; GFX11-NEXT: v_add_f64 v[25:26], s[14:15], 1.0 -; GFX11-NEXT: v_add_f64 v[21:22], s[12:13], 1.0 -; GFX11-NEXT: v_add_f64 v[17:18], s[10:11], 1.0 -; GFX11-NEXT: v_add_f64 v[13:14], s[8:9], 1.0 -; GFX11-NEXT: v_add_f64 v[11:12], s[6:7], 1.0 -; GFX11-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[48:49], s[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[36:37], s[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[30:31], s[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], s[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], s[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[5:6], s[4:5], 1.0 ; GFX11-NEXT: v_add_f64 v[3:4], s[2:3], 1.0 ; GFX11-NEXT: v_add_f64 v[1:2], s[0:1], 1.0 -; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[50:51] -; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[52:53] -; GFX11-NEXT: v_readfirstlane_b32 s41, v69 +; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] +; GFX11-NEXT: v_lshrrev_b64 v[68:69], 24, v[32:33] +; GFX11-NEXT: v_readfirstlane_b32 s41, v67 ; GFX11-NEXT: v_readfirstlane_b32 s29, v65 ; GFX11-NEXT: v_readfirstlane_b32 s27, v53 -; GFX11-NEXT: v_readfirstlane_b32 s25, v51 -; GFX11-NEXT: v_readfirstlane_b32 s23, v49 -; GFX11-NEXT: v_readfirstlane_b32 s21, v36 -; GFX11-NEXT: v_readfirstlane_b32 s19, v32 -; GFX11-NEXT: v_readfirstlane_b32 s17, v30 -; GFX11-NEXT: v_readfirstlane_b32 s15, v26 -; GFX11-NEXT: v_readfirstlane_b32 s13, v22 -; GFX11-NEXT: v_readfirstlane_b32 s11, v18 -; GFX11-NEXT: v_readfirstlane_b32 s9, v14 -; GFX11-NEXT: v_readfirstlane_b32 s7, v12 -; GFX11-NEXT: v_readfirstlane_b32 s5, v8 +; GFX11-NEXT: v_readfirstlane_b32 s25, v49 +; GFX11-NEXT: v_readfirstlane_b32 s23, v37 +; GFX11-NEXT: v_readfirstlane_b32 s21, v33 +; GFX11-NEXT: v_readfirstlane_b32 s19, v31 +; GFX11-NEXT: v_readfirstlane_b32 s17, v27 +; GFX11-NEXT: v_readfirstlane_b32 s15, v23 +; GFX11-NEXT: v_readfirstlane_b32 s13, v19 +; GFX11-NEXT: v_readfirstlane_b32 s11, v15 +; GFX11-NEXT: v_readfirstlane_b32 s9, v13 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: v_readfirstlane_b32 s5, v6 ; GFX11-NEXT: v_readfirstlane_b32 s3, v4 ; GFX11-NEXT: v_readfirstlane_b32 s1, v2 -; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[25:26] -; GFX11-NEXT: v_lshrrev_b64 v[82:83], 24, v[64:65] -; GFX11-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] -; GFX11-NEXT: v_lshrrev_b64 v[9:10], 24, v[3:4] -; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] -; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[13:14] -; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[17:18] -; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[29:30] -; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] -; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[35:36] +; GFX11-NEXT: v_lshrrev_b64 v[9:10], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b64 v[69:70], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[64:65] +; GFX11-NEXT: v_lshrrev_b64 v[10:11], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[16:17], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[12:13] +; GFX11-NEXT: v_lshrrev_b64 v[28:29], 24, v[14:15] +; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[22:23] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[26:27] +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[30:31] ; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[48:49] -; GFX11-NEXT: v_lshrrev_b64 v[83:84], 24, v[68:69] +; GFX11-NEXT: v_lshrrev_b64 v[82:83], 24, v[66:67] ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 8, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 8, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 8, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 8, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v35 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v67, 8, v48 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v52 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v52 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v64 -; GFX11-NEXT: v_lshrrev_b32_e32 v97, 8, v64 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v68 -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 8, v68 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 8, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v64 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v64 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v66 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v66 ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_lshr_b32 s10, s1, 16 ; GFX11-NEXT: s_lshr_b32 s12, s1, 8 @@ -121082,9 +120581,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: s_lshr_b32 s79, s23, 24 ; GFX11-NEXT: s_lshr_b32 s88, s23, 16 ; GFX11-NEXT: s_lshr_b32 s89, s23, 8 -; GFX11-NEXT: s_lshr_b32 s90, s25, 24 -; GFX11-NEXT: s_lshr_b32 s91, s25, 16 -; GFX11-NEXT: s_lshr_b32 s92, s25, 8 +; GFX11-NEXT: s_lshr_b32 s101, s25, 24 +; GFX11-NEXT: s_lshr_b32 s102, s25, 16 +; GFX11-NEXT: s_lshr_b32 s103, s25, 8 ; GFX11-NEXT: s_lshr_b32 s104, s27, 24 ; GFX11-NEXT: s_lshr_b32 vcc_hi, s27, 16 ; GFX11-NEXT: s_lshr_b32 s34, s27, 8 @@ -121101,20 +120600,20 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: s_mov_b32 s49, -1 ; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; kill: killed $sgpr43 -; GFX11-NEXT: ; implicit-def: $sgpr51 -; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr53 +; GFX11-NEXT: ; implicit-def: $sgpr54 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr48 ; GFX11-NEXT: ; implicit-def: $sgpr39 ; GFX11-NEXT: ; implicit-def: $sgpr38 -; GFX11-NEXT: ; implicit-def: $sgpr102 -; GFX11-NEXT: ; implicit-def: $sgpr50 +; GFX11-NEXT: ; implicit-def: $sgpr51 +; GFX11-NEXT: ; implicit-def: $sgpr52 ; GFX11-NEXT: ; implicit-def: $sgpr44 ; GFX11-NEXT: ; implicit-def: $sgpr37 ; GFX11-NEXT: ; implicit-def: $sgpr36 ; GFX11-NEXT: ; implicit-def: $sgpr35 ; GFX11-NEXT: ; implicit-def: $sgpr100 -; GFX11-NEXT: ; implicit-def: $sgpr101 +; GFX11-NEXT: ; implicit-def: $sgpr50 ; GFX11-NEXT: ; implicit-def: $sgpr46 ; GFX11-NEXT: ; implicit-def: $sgpr34 ; GFX11-NEXT: ; implicit-def: $vcc_hi @@ -121122,6 +120621,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: ; implicit-def: $sgpr98 ; GFX11-NEXT: ; implicit-def: $sgpr99 ; GFX11-NEXT: ; implicit-def: $sgpr56 +; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr102 +; GFX11-NEXT: ; implicit-def: $sgpr101 ; GFX11-NEXT: ; implicit-def: $sgpr96 ; GFX11-NEXT: ; implicit-def: $sgpr97 ; GFX11-NEXT: ; implicit-def: $sgpr58 @@ -121130,32 +120632,29 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: ; implicit-def: $sgpr60 ; GFX11-NEXT: ; implicit-def: $sgpr84 ; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr62 ; GFX11-NEXT: ; implicit-def: $sgpr82 ; GFX11-NEXT: ; implicit-def: $sgpr83 +; GFX11-NEXT: ; implicit-def: $sgpr72 ; GFX11-NEXT: ; implicit-def: $sgpr80 ; GFX11-NEXT: ; implicit-def: $sgpr81 +; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $sgpr70 ; GFX11-NEXT: ; implicit-def: $sgpr71 +; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: ; implicit-def: $sgpr68 ; GFX11-NEXT: ; implicit-def: $sgpr69 +; GFX11-NEXT: ; implicit-def: $sgpr78 ; GFX11-NEXT: ; implicit-def: $sgpr66 ; GFX11-NEXT: ; implicit-def: $sgpr67 +; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr64 ; GFX11-NEXT: ; implicit-def: $sgpr65 -; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr90 ; GFX11-NEXT: ; implicit-def: $sgpr55 -; GFX11-NEXT: ; implicit-def: $sgpr52 -; GFX11-NEXT: ; implicit-def: $sgpr53 -; GFX11-NEXT: ; implicit-def: $sgpr30 -; GFX11-NEXT: ; implicit-def: $sgpr94 ; GFX11-NEXT: ; implicit-def: $sgpr92 -; GFX11-NEXT: ; implicit-def: $sgpr90 -; GFX11-NEXT: ; implicit-def: $sgpr88 -; GFX11-NEXT: ; implicit-def: $sgpr78 -; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: ; implicit-def: $sgpr74 -; GFX11-NEXT: ; implicit-def: $sgpr72 -; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: ; implicit-def: $sgpr94 +; GFX11-NEXT: ; implicit-def: $sgpr30 ; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; kill: killed $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr43 @@ -121236,80 +120735,82 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: ; kill: killed $sgpr43 ; GFX11-NEXT: s_branch .LBB73_2 ; GFX11-NEXT: .LBB73_4: -; GFX11-NEXT: v_readlane_b32 s43, v42, 0 -; GFX11-NEXT: v_dual_mov_b32 v98, s103 :: v_dual_mov_b32 v99, s51 -; GFX11-NEXT: v_dual_mov_b32 v96, s50 :: v_dual_mov_b32 v97, s102 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v2, s43 :: v_dual_mov_b32 v7, s4 -; GFX11-NEXT: v_readlane_b32 s43, v42, 1 -; GFX11-NEXT: v_dual_mov_b32 v86, s101 :: v_dual_mov_b32 v87, s100 -; GFX11-NEXT: v_dual_mov_b32 v71, s99 :: v_dual_mov_b32 v34, s83 -; GFX11-NEXT: v_dual_mov_b32 v85, s98 :: v_dual_mov_b32 v36, s82 -; GFX11-NEXT: v_dual_mov_b32 v55, s97 :: v_dual_mov_b32 v30, s81 -; GFX11-NEXT: v_dual_mov_b32 v67, s96 :: v_dual_mov_b32 v32, s80 -; GFX11-NEXT: v_dual_mov_b32 v51, s87 :: v_dual_mov_b32 v26, s71 -; GFX11-NEXT: v_dual_mov_b32 v53, s86 :: v_dual_mov_b32 v28, s70 -; GFX11-NEXT: v_dual_mov_b32 v39, s85 :: v_dual_mov_b32 v22, s69 -; GFX11-NEXT: v_dual_mov_b32 v49, s84 :: v_dual_mov_b32 v24, s68 -; GFX11-NEXT: v_dual_mov_b32 v18, s67 :: v_dual_mov_b32 v35, s20 -; GFX11-NEXT: v_dual_mov_b32 v20, s66 :: v_dual_mov_b32 v31, s18 -; GFX11-NEXT: v_dual_mov_b32 v14, s65 :: v_dual_mov_b32 v29, s16 -; GFX11-NEXT: v_dual_mov_b32 v16, s64 :: v_dual_mov_b32 v25, s14 -; GFX11-NEXT: v_dual_mov_b32 v10, s55 :: v_dual_mov_b32 v21, s12 -; GFX11-NEXT: v_dual_mov_b32 v12, s54 :: v_dual_mov_b32 v17, s10 -; GFX11-NEXT: v_dual_mov_b32 v6, s53 :: v_dual_mov_b32 v13, s8 -; GFX11-NEXT: v_dual_mov_b32 v8, s52 :: v_dual_mov_b32 v11, s6 -; GFX11-NEXT: v_dual_mov_b32 v4, s43 :: v_dual_mov_b32 v3, s2 -; GFX11-NEXT: v_dual_mov_b32 v68, s40 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v64, s28 :: v_dual_mov_b32 v83, s42 -; GFX11-NEXT: v_dual_mov_b32 v52, s26 :: v_dual_mov_b32 v81, s46 -; GFX11-NEXT: v_dual_mov_b32 v50, s24 :: v_dual_mov_b32 v37, s92 -; GFX11-NEXT: v_dual_mov_b32 v48, s22 :: v_dual_mov_b32 v33, s90 -; GFX11-NEXT: v_dual_mov_b32 v82, s44 :: v_dual_mov_b32 v27, s88 -; GFX11-NEXT: v_dual_mov_b32 v80, s56 :: v_dual_mov_b32 v23, s78 -; GFX11-NEXT: v_dual_mov_b32 v70, s58 :: v_dual_mov_b32 v19, s76 -; GFX11-NEXT: v_dual_mov_b32 v66, s60 :: v_dual_mov_b32 v15, s74 -; GFX11-NEXT: v_dual_mov_b32 v54, s30 :: v_dual_mov_b32 v9, s72 -; GFX11-NEXT: v_dual_mov_b32 v38, s94 :: v_dual_mov_b32 v5, s62 -; GFX11-NEXT: v_readlane_b32 s8, v42, 2 -; GFX11-NEXT: v_readlane_b32 s10, v42, 3 -; GFX11-NEXT: v_readlane_b32 s12, v42, 4 -; GFX11-NEXT: v_readlane_b32 s14, v42, 5 -; GFX11-NEXT: v_readlane_b32 s16, v42, 6 -; GFX11-NEXT: v_readlane_b32 s18, v42, 7 -; GFX11-NEXT: v_readlane_b32 s20, v42, 8 -; GFX11-NEXT: v_readlane_b32 s22, v42, 9 -; GFX11-NEXT: v_readlane_b32 s24, v42, 10 -; GFX11-NEXT: v_readlane_b32 s26, v42, 11 -; GFX11-NEXT: v_readlane_b32 s28, v42, 12 -; GFX11-NEXT: v_readlane_b32 s40, v42, 13 -; GFX11-NEXT: v_readlane_b32 s42, v42, 14 -; GFX11-NEXT: v_readlane_b32 s43, v42, 15 -; GFX11-NEXT: v_readlane_b32 s44, v42, 16 -; GFX11-NEXT: v_readlane_b32 s45, v42, 17 -; GFX11-NEXT: v_readlane_b32 s46, v42, 18 -; GFX11-NEXT: v_readlane_b32 s47, v42, 19 -; GFX11-NEXT: v_readlane_b32 s56, v42, 20 -; GFX11-NEXT: v_readlane_b32 s57, v42, 21 -; GFX11-NEXT: v_readlane_b32 s58, v42, 22 -; GFX11-NEXT: v_readlane_b32 s59, v42, 23 -; GFX11-NEXT: v_readlane_b32 s60, v42, 24 -; GFX11-NEXT: v_readlane_b32 s61, v42, 25 -; GFX11-NEXT: v_readlane_b32 s62, v42, 26 -; GFX11-NEXT: v_readlane_b32 s63, v42, 27 -; GFX11-NEXT: v_readlane_b32 s72, v42, 28 -; GFX11-NEXT: v_readlane_b32 s73, v42, 29 -; GFX11-NEXT: v_readlane_b32 s74, v42, 30 -; GFX11-NEXT: v_readlane_b32 s75, v42, 31 -; GFX11-NEXT: v_readlane_b32 s76, v43, 0 -; GFX11-NEXT: v_readlane_b32 s77, v43, 1 -; GFX11-NEXT: v_readlane_b32 s78, v43, 2 -; GFX11-NEXT: v_readlane_b32 s79, v43, 3 -; GFX11-NEXT: v_readlane_b32 s88, v43, 4 -; GFX11-NEXT: v_readlane_b32 s89, v43, 5 -; GFX11-NEXT: v_readlane_b32 s90, v43, 6 -; GFX11-NEXT: v_readlane_b32 s91, v43, 7 -; GFX11-NEXT: v_readlane_b32 s92, v43, 8 +; GFX11-NEXT: v_dual_mov_b32 v48, s24 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_readlane_b32 s0, v42, 0 +; GFX11-NEXT: v_dual_mov_b32 v66, s40 :: v_dual_mov_b32 v7, s6 +; GFX11-NEXT: v_dual_mov_b32 v64, s28 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v13, s0 +; GFX11-NEXT: v_readlane_b32 s0, v42, 1 +; GFX11-NEXT: v_dual_mov_b32 v17, s64 :: v_dual_mov_b32 v52, s26 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v36, s22 +; GFX11-NEXT: v_dual_mov_b32 v87, s54 :: v_dual_mov_b32 v6, s0 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: v_dual_mov_b32 v11, s55 :: v_dual_mov_b32 v32, s20 +; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v85, s52 :: v_dual_mov_b32 v8, s0 +; GFX11-NEXT: v_readlane_b32 s0, v42, 3 +; GFX11-NEXT: v_dual_mov_b32 v26, s16 :: v_dual_mov_b32 v83, s50 +; GFX11-NEXT: v_dual_mov_b32 v22, s14 :: v_dual_mov_b32 v67, s99 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: v_readlane_b32 s0, v42, 4 +; GFX11-NEXT: v_dual_mov_b32 v18, s12 :: v_dual_mov_b32 v71, s98 +; GFX11-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v69, s58 +; GFX11-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v55, s97 +; GFX11-NEXT: v_dual_mov_b32 v82, s42 :: v_dual_mov_b32 v65, s96 +; GFX11-NEXT: v_dual_mov_b32 v96, s53 :: v_dual_mov_b32 v51, s87 +; GFX11-NEXT: v_dual_mov_b32 v86, s51 :: v_dual_mov_b32 v53, s86 +; GFX11-NEXT: v_dual_mov_b32 v80, s46 :: v_dual_mov_b32 v39, s85 +; GFX11-NEXT: v_dual_mov_b32 v84, s100 :: v_dual_mov_b32 v49, s84 +; GFX11-NEXT: v_dual_mov_b32 v70, s56 :: v_dual_mov_b32 v35, s83 +; GFX11-NEXT: v_dual_mov_b32 v68, s60 :: v_dual_mov_b32 v37, s82 +; GFX11-NEXT: v_dual_mov_b32 v54, s62 :: v_dual_mov_b32 v31, s81 +; GFX11-NEXT: v_dual_mov_b32 v50, s72 :: v_dual_mov_b32 v33, s80 +; GFX11-NEXT: v_dual_mov_b32 v38, s74 :: v_dual_mov_b32 v27, s71 +; GFX11-NEXT: v_dual_mov_b32 v34, s76 :: v_dual_mov_b32 v29, s70 +; GFX11-NEXT: v_dual_mov_b32 v28, s78 :: v_dual_mov_b32 v23, s69 +; GFX11-NEXT: v_dual_mov_b32 v25, s68 :: v_dual_mov_b32 v24, s88 +; GFX11-NEXT: v_dual_mov_b32 v19, s67 :: v_dual_mov_b32 v20, s90 +; GFX11-NEXT: v_dual_mov_b32 v21, s66 :: v_dual_mov_b32 v16, s92 +; GFX11-NEXT: v_dual_mov_b32 v15, s65 :: v_dual_mov_b32 v10, s94 +; GFX11-NEXT: v_dual_mov_b32 v9, s30 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: v_readlane_b32 s8, v42, 5 +; GFX11-NEXT: v_readlane_b32 s10, v42, 6 +; GFX11-NEXT: v_readlane_b32 s12, v42, 7 +; GFX11-NEXT: v_readlane_b32 s14, v42, 8 +; GFX11-NEXT: v_readlane_b32 s16, v42, 9 +; GFX11-NEXT: v_readlane_b32 s18, v42, 10 +; GFX11-NEXT: v_readlane_b32 s20, v42, 11 +; GFX11-NEXT: v_readlane_b32 s22, v42, 12 +; GFX11-NEXT: v_readlane_b32 s24, v42, 13 +; GFX11-NEXT: v_readlane_b32 s26, v42, 14 +; GFX11-NEXT: v_readlane_b32 s28, v42, 15 +; GFX11-NEXT: v_readlane_b32 s40, v42, 16 +; GFX11-NEXT: v_readlane_b32 s42, v42, 17 +; GFX11-NEXT: v_readlane_b32 s43, v42, 18 +; GFX11-NEXT: v_readlane_b32 s44, v42, 19 +; GFX11-NEXT: v_readlane_b32 s45, v42, 20 +; GFX11-NEXT: v_readlane_b32 s46, v42, 21 +; GFX11-NEXT: v_readlane_b32 s47, v42, 22 +; GFX11-NEXT: v_readlane_b32 s56, v42, 23 +; GFX11-NEXT: v_readlane_b32 s57, v42, 24 +; GFX11-NEXT: v_readlane_b32 s58, v42, 25 +; GFX11-NEXT: v_readlane_b32 s59, v42, 26 +; GFX11-NEXT: v_readlane_b32 s60, v42, 27 +; GFX11-NEXT: v_readlane_b32 s61, v42, 28 +; GFX11-NEXT: v_readlane_b32 s62, v42, 29 +; GFX11-NEXT: v_readlane_b32 s63, v42, 30 +; GFX11-NEXT: v_readlane_b32 s72, v42, 31 +; GFX11-NEXT: v_readlane_b32 s73, v43, 0 +; GFX11-NEXT: v_readlane_b32 s74, v43, 1 +; GFX11-NEXT: v_readlane_b32 s75, v43, 2 +; GFX11-NEXT: v_readlane_b32 s76, v43, 3 +; GFX11-NEXT: v_readlane_b32 s77, v43, 4 +; GFX11-NEXT: v_readlane_b32 s78, v43, 5 +; GFX11-NEXT: v_readlane_b32 s79, v43, 6 +; GFX11-NEXT: v_readlane_b32 s88, v43, 7 +; GFX11-NEXT: v_readlane_b32 s89, v43, 8 ; GFX11-NEXT: .LBB73_5: ; %end ; GFX11-NEXT: s_and_b32 s0, s41, 0xff ; GFX11-NEXT: s_lshl_b32 s2, s48, 8 @@ -121318,313 +120819,313 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: s_and_b32 s2, s39, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v68 +; GFX11-NEXT: s_lshl_b32 s4, s37, 8 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v99 +; GFX11-NEXT: s_lshl_b32 s6, s35, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s29, 0xff -; GFX11-NEXT: s_lshl_b32 s4, s37, 8 -; GFX11-NEXT: s_lshl_b32 s6, s35, 8 +; GFX11-NEXT: v_dual_mov_b32 v97, s0 :: v_dual_and_b32 v66, 0xff, v66 ; GFX11-NEXT: s_or_b32 s2, s2, s4 ; GFX11-NEXT: s_and_b32 s4, s36, 0xff -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v83 -; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: v_or_b32_e32 v65, v65, v69 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v98 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_or_b32 s4, s4, s6 +; GFX11-NEXT: s_and_b32 s0, s27, 0xff ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-NEXT: s_lshl_b32 s6, s90, 8 +; GFX11-NEXT: s_lshl_b32 s6, s101, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_or_b32_e32 v68, v69, v68 -; GFX11-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_and_b32 v64, 0xff, v64 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v97 -; GFX11-NEXT: v_mov_b32_e32 v97, s0 -; GFX11-NEXT: s_and_b32 s0, s27, 0xff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v99, s2 :: v_dual_lshlrev_b32 v96, 8, v96 ; GFX11-NEXT: s_lshl_b32 s2, s34, 8 ; GFX11-NEXT: s_lshl_b32 s4, s104, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, vcc_hi, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: s_lshl_b32 s4, s92, 8 +; GFX11-NEXT: s_lshl_b32 s4, s103, 8 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_and_b32_e32 v65, 0xffff, v65 +; GFX11-NEXT: v_and_b32_e32 v87, 0xff, v87 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s25, 0xff -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v82 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: s_and_b32 s4, s91, 0xff +; GFX11-NEXT: s_and_b32 s4, s102, 0xff ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 8, v86 ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-NEXT: v_or_b32_e32 v64, v64, v69 +; GFX11-NEXT: v_and_b32_e32 v85, 0xff, v85 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v81 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v82 -; GFX11-NEXT: v_or_b32_e32 v96, v65, v68 -; GFX11-NEXT: v_dual_mov_b32 v81, s0 :: v_dual_lshlrev_b32 v68, 8, v81 -; GFX11-NEXT: s_and_b32 s0, s23, 0xff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v69, v83, v82 -; GFX11-NEXT: v_mov_b32_e32 v83, s2 -; GFX11-NEXT: s_lshl_b32 s2, s89, 8 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v64 +; GFX11-NEXT: v_or_b32_e32 v66, v66, v96 +; GFX11-NEXT: v_or_b32_e32 v82, v87, v82 +; GFX11-NEXT: v_or_b32_e32 v81, v85, v81 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v52 +; GFX11-NEXT: v_or_b32_e32 v64, v64, v86 +; GFX11-NEXT: v_and_b32_e32 v66, 0xffff, v66 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v82 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v64, 0xffff, v64 +; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-NEXT: v_or_b32_e32 v96, v66, v82 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v83 +; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v67 +; GFX11-NEXT: v_or_b32_e32 v98, v64, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v84 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v70 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_or_b32_e32 v52, v52, v64 +; GFX11-NEXT: v_or_b32_e32 v64, v66, v80 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v71 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35 ; GFX11-NEXT: s_lshl_b32 s4, s79, 8 +; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v52 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-NEXT: v_or_b32_e32 v48, v48, v66 +; GFX11-NEXT: v_or_b32_e32 v66, v67, v70 +; GFX11-NEXT: v_mov_b32_e32 v67, s2 +; GFX11-NEXT: s_lshl_b32 s2, s89, 8 +; GFX11-NEXT: v_or_b32_e32 v64, v52, v64 +; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v66 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 8, v69 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_or_b32_e32 v66, v48, v66 +; GFX11-NEXT: v_dual_mov_b32 v65, s0 :: v_dual_lshlrev_b32 v48, 8, v65 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off +; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:16 +; GFX11-NEXT: v_or_b32_e32 v36, v36, v48 +; GFX11-NEXT: v_or_b32_e32 v48, v52, v55 +; GFX11-NEXT: s_and_b32 s0, s23, 0xff +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v48 ; GFX11-NEXT: s_and_b32 s2, s88, 0xff -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: s_and_b32 s4, s21, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: v_or_b32_e32 v64, v36, v48 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v53 +; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v51 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v68 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_and_b32 s4, s21, 0xff +; GFX11-NEXT: v_or_b32_e32 v32, v32, v36 ; GFX11-NEXT: s_lshl_b32 s6, s78, 8 +; GFX11-NEXT: v_or_b32_e32 v36, v48, v51 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_or_b32 s2, s4, s6 +; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 ; GFX11-NEXT: s_and_b32 s4, s77, 0xff +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v36 ; GFX11-NEXT: s_lshl_b32 s6, s76, 8 -; GFX11-NEXT: v_and_b32_e32 v64, 0xffff, v64 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v33 ; GFX11-NEXT: s_or_b32 s4, s4, s6 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: v_or_b32_e32 v66, v32, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v49 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 8, v54 ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v86 -; GFX11-NEXT: v_or_b32_e32 v98, v64, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v87 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: v_or_b32_e32 v30, v30, v32 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v52 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v50 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v80 -; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v48 -; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v64 -; GFX11-NEXT: v_or_b32_e32 v64, v65, v68 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v85 -; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v71 -; GFX11-NEXT: v_and_b32_e32 v51, 0xff, v51 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v52 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v64 -; GFX11-NEXT: v_or_b32_e32 v50, v50, v65 -; GFX11-NEXT: v_or_b32_e32 v65, v68, v69 -; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GFX11-NEXT: v_or_b32_e32 v80, v52, v64 -; GFX11-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v65 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v55 -; GFX11-NEXT: v_lshlrev_b32_e32 v55, 8, v70 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v36 -; GFX11-NEXT: v_or_b32_e32 v82, v50, v65 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v67 -; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 8, v38 -; GFX11-NEXT: v_or_b32_e32 v36, v29, v36 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-NEXT: v_or_b32_e32 v48, v48, v50 -; GFX11-NEXT: v_or_b32_e32 v50, v52, v55 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v66 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v32 -; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 -; GFX11-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-NEXT: s_lshl_b32 s4, s73, 8 -; GFX11-NEXT: v_or_b32_e32 v34, v34, v38 -; GFX11-NEXT: v_or_b32_e32 v25, v25, v32 -; GFX11-NEXT: s_lshl_b32 s6, s62, 8 -; GFX11-NEXT: v_or_b32_e32 v48, v48, v50 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26 -; GFX11-NEXT: v_or_b32_e32 v35, v35, v50 -; GFX11-NEXT: v_or_b32_e32 v50, v51, v52 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v54 -; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 -; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 -; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-NEXT: v_or_b32_e32 v50, v35, v50 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 8, v49 -; GFX11-NEXT: v_mov_b32_e32 v49, s0 +; GFX11-NEXT: v_or_b32_e32 v32, v36, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 8, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 8, v50 +; GFX11-NEXT: v_dual_mov_b32 v65, s0 :: v_dual_and_b32 v30, 0xffff, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_or_b32_e32 v26, v26, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v35, v35, v37 +; GFX11-NEXT: v_mov_b32_e32 v67, s2 ; GFX11-NEXT: s_and_b32 s0, s19, 0xff -; GFX11-NEXT: v_or_b32_e32 v13, v13, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX11-NEXT: v_or_b32_e32 v31, v31, v35 -; GFX11-NEXT: v_or_b32_e32 v35, v39, v51 -; GFX11-NEXT: v_mov_b32_e32 v51, s2 +; GFX11-NEXT: v_or_b32_e32 v30, v30, v32 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 8, v38 ; GFX11-NEXT: s_lshl_b32 s2, s75, 8 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX11-NEXT: s_lshl_b32 s4, s73, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s74, 0xff -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: v_or_b32_e32 v31, v31, v35 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_or_b32_e32 v29, v31, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 8, v37 +; GFX11-NEXT: v_or_b32_e32 v32, v26, v32 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 8, v34 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_lshl_b32 s4, s72, 8 +; GFX11-NEXT: v_or_b32_e32 v35, v22, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v29 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s17, 0xff -; GFX11-NEXT: v_or_b32_e32 v30, v30, v35 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v36 +; GFX11-NEXT: s_lshl_b32 s6, s62, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX11-NEXT: v_or_b32_e32 v22, v26, v27 ; GFX11-NEXT: s_and_b32 s4, s63, 0xff ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v30 ; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-NEXT: v_mov_b32_e32 v30, s0 -; GFX11-NEXT: v_or_b32_e32 v34, v25, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: v_mov_b32_e32 v31, s0 +; GFX11-NEXT: v_mov_b32_e32 v33, s2 ; GFX11-NEXT: s_and_b32 s0, s15, 0xff -; GFX11-NEXT: v_mov_b32_e32 v32, s2 ; GFX11-NEXT: s_lshl_b32 s2, s61, 8 -; GFX11-NEXT: v_or_b32_e32 v21, v21, v25 -; GFX11-NEXT: v_or_b32_e32 v25, v26, v28 +; GFX11-NEXT: v_or_b32_e32 v37, v18, v22 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v25 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v28 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s60, 0xff ; GFX11-NEXT: s_lshl_b32 s4, s59, 8 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX11-NEXT: s_and_b32 s4, s13, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_lshl_b32 s6, s58, 8 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v18 +; GFX11-NEXT: v_or_b32_e32 v18, v22, v23 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_or_b32 s2, s4, s6 ; GFX11-NEXT: s_and_b32 s4, s57, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s56, 8 -; GFX11-NEXT: v_or_b32_e32 v36, v21, v25 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_or_b32_e32 v19, v19, v22 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-NEXT: v_mov_b32_e32 v35, s0 +; GFX11-NEXT: v_mov_b32_e32 v36, s0 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_or_b32_e32 v17, v17, v21 -; GFX11-NEXT: v_or_b32_e32 v21, v22, v24 -; GFX11-NEXT: v_mov_b32_e32 v37, s2 ; GFX11-NEXT: s_and_b32 s0, s11, 0xff +; GFX11-NEXT: v_mov_b32_e32 v38, s2 ; GFX11-NEXT: s_lshl_b32 s2, s47, 8 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v23 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX11-NEXT: v_or_b32_e32 v18, v14, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v20 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s46, 0xff ; GFX11-NEXT: s_lshl_b32 s4, s45, 8 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: v_or_b32_e32 v12, v12, v21 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v17 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v19 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_lshl_b32 s4, s44, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s9, 0xff -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-NEXT: s_or_b32 s2, s2, s4 ; GFX11-NEXT: s_and_b32 s4, s43, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s42, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: v_or_b32_e32 v14, v14, v19 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-NEXT: v_or_b32_e32 v19, v13, v18 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v15 +; GFX11-NEXT: v_or_b32_e32 v20, v12, v14 +; GFX11-NEXT: v_or_b32_e32 v12, v7, v15 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v16 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_mov_b32_e32 v18, s0 -; GFX11-NEXT: v_mov_b32_e32 v20, s2 +; GFX11-NEXT: v_mov_b32_e32 v19, s0 +; GFX11-NEXT: v_mov_b32_e32 v21, s2 ; GFX11-NEXT: s_and_b32 s0, s7, 0xff ; GFX11-NEXT: s_lshl_b32 s2, s40, 8 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX11-NEXT: s_lshl_b32 s4, s26, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s28, 0xff -; GFX11-NEXT: s_lshl_b32 s4, s26, 8 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v12 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v13 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX11-NEXT: v_or_b32_e32 v7, v11, v13 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_or_b32_e32 v11, v11, v16 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_and_b32 s4, s5, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s24, 8 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_or_b32 s2, s4, s5 ; GFX11-NEXT: s_and_b32 s4, s22, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s20, 8 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_or_b32_e32 v14, v5, v7 ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_or_b32_e32 v13, v7, v10 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v8 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 8, v10 ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-NEXT: v_or_b32_e32 v11, v11, v14 +; GFX11-NEXT: v_mov_b32_e32 v13, s0 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_mov_b32_e32 v12, s0 -; GFX11-NEXT: v_mov_b32_e32 v14, s2 ; GFX11-NEXT: s_and_b32 s0, s3, 0xff +; GFX11-NEXT: v_mov_b32_e32 v15, s2 ; GFX11-NEXT: s_lshl_b32 s2, s18, 8 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v9 ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s16, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s14, 8 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: v_or_b32_e32 v4, v1, v4 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s2 ; GFX11-NEXT: s_lshl_b32 s2, s12, 8 ; GFX11-NEXT: s_and_b32 s3, s10, 0xff ; GFX11-NEXT: s_lshl_b32 s4, s8, 8 -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX11-NEXT: s_or_b32 s1, s1, s2 ; GFX11-NEXT: s_or_b32 s2, s3, s4 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v6 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v5 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX11-NEXT: v_readlane_b32 s104, v41, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off -; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:16 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: v_mov_b32_e32 v4, s1 ; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[34:37], off offset:64 -; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[35:38], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[12:15], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: v_readlane_b32 s104, v41, 8 ; GFX11-NEXT: v_readlane_b32 s103, v41, 7 ; GFX11-NEXT: v_readlane_b32 s102, v41, 6 ; GFX11-NEXT: v_readlane_b32 s101, v41, 5 @@ -121711,22 +121212,22 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 @@ -121745,33 +121246,33 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:152 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:184 -; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v34, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 8, v13 -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v17 -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v25 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v29 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v21 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v55, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -121828,19 +121329,19 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 @@ -121848,19 +121349,19 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -121871,7 +121372,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 @@ -121879,27 +121380,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 @@ -121908,15 +121409,15 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 @@ -121930,24 +121431,24 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 @@ -121956,29 +121457,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:268 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 @@ -121987,29 +121485,29 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 @@ -122018,29 +121516,29 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 @@ -122049,239 +121547,307 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v54 -; SI-NEXT: v_or_b32_e32 v2, v2, v41 +; SI-NEXT: v_or_b32_e32 v0, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: v_or_b32_e32 v3, v40, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v4, v45, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 +; SI-NEXT: v_or_b32_e32 v5, v41, v5 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 +; SI-NEXT: v_or_b32_e32 v6, v62, v6 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v52 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v36 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v62 +; SI-NEXT: v_or_b32_e32 v4, v4, v43 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v63 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v6, v56 +; SI-NEXT: v_or_b32_e32 v6, v6, v34 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v46 +; SI-NEXT: v_or_b32_e32 v7, v7, v38 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v8, v8, v44 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v8, v8, v55 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v46 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v51 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v63 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v44 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v59 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v51 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v40 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v47 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v45 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -122299,204 +121865,299 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v59 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 ; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v43 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 ; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -122508,351 +122169,189 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; kill: killed $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: .LBB74_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v40, v2 +; SI-NEXT: v_or_b32_e32 v2, v53, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v34, v3 +; SI-NEXT: v_or_b32_e32 v3, v47, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v45, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v60, v5 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v5, v41, v5 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v58, v6 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_or_b32_e32 v6, v62, v6 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v61, v8 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v53, v2 +; SI-NEXT: v_or_b32_e32 v2, v52, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v48, v3 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_or_b32_e32 v4, v43, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v38, v5 +; SI-NEXT: v_or_b32_e32 v5, v63, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v56, v6 +; SI-NEXT: v_or_b32_e32 v6, v34, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v46, v7 +; SI-NEXT: v_or_b32_e32 v7, v38, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v8, v44, v8 +; SI-NEXT: v_or_b32_e32 v8, v55, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 @@ -122860,15 +122359,14 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 @@ -122876,12 +122374,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 @@ -122889,12 +122387,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 @@ -122902,12 +122400,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 @@ -122915,12 +122413,14 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v13 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 @@ -122952,7 +122452,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -122961,7 +122461,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -122969,7 +122469,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -122978,7 +122478,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v17 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -122986,7 +122486,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -122995,7 +122495,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v18 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -123003,7 +122503,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -123012,7 +122512,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v19 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -123020,7 +122520,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v20 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -123029,7 +122529,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -123037,7 +122537,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v21 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -123046,7 +122546,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v21 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -123054,7 +122554,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -123063,15 +122563,15 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v23 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -123080,32 +122580,30 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -123114,15 +122612,15 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -123131,15 +122629,15 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v26 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v27 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -123148,15 +122646,15 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -123165,15 +122663,15 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v28 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v29 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -123182,15 +122680,15 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 ; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -123199,19 +122697,21 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 -; SI-NEXT: v_or_b32_e32 v31, v43, v31 ; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 -; SI-NEXT: v_add_i32_e32 v31, vcc, 0x300, v31 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 ; SI-NEXT: v_or_b32_e32 v32, v33, v32 ; SI-NEXT: v_or_b32_e32 v31, v32, v31 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 @@ -123288,16 +122788,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v9 @@ -123333,43 +122833,42 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill @@ -123379,13 +122878,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 @@ -123395,11 +122894,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 @@ -123408,7 +122907,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -123418,7 +122917,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill @@ -123432,19 +122931,19 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill @@ -123458,25 +122957,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -123484,25 +122983,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -123510,25 +123009,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -123536,15 +123035,15 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 @@ -123553,88 +123052,88 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB74_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v14, v44, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: v_or_b32_sdwa v15, v41, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -123658,84 +123157,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -123749,41 +123180,41 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -123801,11 +123232,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -123818,17 +123249,17 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -123837,289 +123268,359 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; kill: killed $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: .LBB74_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB74_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v31, 0x300 +; VI-NEXT: v_add_u16_e32 v9, 3, v62 ; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 -; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v5, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 ; VI-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -124151,11 +123652,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: v_add_u16_e32 v8, 3, v62 +; VI-NEXT: v_add_u16_e32 v8, 3, v63 ; VI-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v9 -; VI-NEXT: v_add_u16_e32 v9, 3, v32 +; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v10, 3, v60 ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -124164,9 +123665,8 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v10, v10, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v9, v9, v10 ; VI-NEXT: v_add_u16_e32 v10, 3, v58 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v11, 3, v56 ; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -124188,7 +123688,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 3, v46 ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 ; VI-NEXT: v_add_u16_sdwa v13, v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v12, v12, v13 @@ -124196,39 +123696,38 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 3, v44 -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 ; VI-NEXT: v_add_u16_sdwa v14, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v13, v13, v14 -; VI-NEXT: v_add_u16_e32 v14, 3, v42 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v14, 3, v43 ; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v15, 3, v40 +; VI-NEXT: v_add_u16_e32 v15, 3, v41 ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_sdwa v15, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v14, v14, v15 ; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v15, 3, v15 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v15, v15, v16 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 @@ -124236,12 +123735,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v16, v16, v17 ; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v18, 3, v18 @@ -124274,7 +123773,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v19, v19, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v20, 3, v20 @@ -124293,14 +123792,14 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v21, 3, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v21, 0x300, v21 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 ; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v22, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v21, v21, v22 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v22, 3, v22 @@ -124327,46 +123826,46 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v24, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v23, v23, v24 ; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v24, 3, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v24, v24, v25 -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v25, 3, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v26, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v25, v25, v26 -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v26, 3, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v26, v26, v27 -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v27, 3, v27 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -124378,21 +123877,21 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v28, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v27, v27, v28 -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v28, 3, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v29, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v28, v28, v29 -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v29, 3, v29 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -124405,7 +123904,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v29, v29, v30 ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v30, 3, v30 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -124417,7 +123916,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v32, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v30, v30, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v32, 3, v32 @@ -124427,7 +123926,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v33, 3, v33 -; VI-NEXT: v_or_b32_sdwa v33, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: .LBB74_4: ; %end @@ -124504,16 +124003,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:176 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:184 -; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v3 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v9 @@ -124559,47 +124058,45 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v32 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill @@ -124609,13 +124106,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 @@ -124626,11 +124123,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 @@ -124639,7 +124136,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill @@ -124651,7 +124148,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill @@ -124666,19 +124163,19 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill @@ -124693,25 +124190,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -124720,25 +124217,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 @@ -124747,25 +124244,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 @@ -124774,106 +124271,105 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB74_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v57, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_or_b32_sdwa v15, v41, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -124897,84 +124393,16 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v62, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v32, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -124988,41 +124416,41 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -125040,11 +124468,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -125057,17 +124485,17 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -125076,296 +124504,363 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; kill: killed $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: .LBB74_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB74_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v62 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(27) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(21) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -125397,11 +124892,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v62 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 ; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v32 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -125410,7 +124905,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 @@ -125433,7 +124928,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 ; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 @@ -125441,39 +124936,38 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v14, 3, v43 ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v41 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 @@ -125481,12 +124975,12 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 @@ -125519,7 +125013,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 @@ -125538,14 +125032,14 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 ; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 @@ -125572,46 +125066,46 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 ; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v25 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -125623,21 +125117,21 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -125650,7 +125144,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -125662,7 +125156,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 @@ -125672,7 +125166,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v32, v63, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 ; GFX9-NEXT: .LBB74_4: ; %end @@ -127489,254 +126983,244 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v55, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v25 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 -; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v44 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v43 -; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v42 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v28 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v54 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v53 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v0 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:204 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -127744,920 +127228,933 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB75_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v0, v0, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v59, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v55, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v2, v2, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v61 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v47, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v26, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v19 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v8, v1 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v23 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v0, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 -; SI-NEXT: v_or_b32_e32 v0, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v43 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_or_b32_e32 v0, v0, v32 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mov_b32_e32 v50, v16 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v16, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: v_mov_b32_e32 v48, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v17, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v18, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v57, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v58, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 -; SI-NEXT: v_mov_b32_e32 v54, v23 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v20, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v20, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v57, v3 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v21, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v22, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v45, v24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v34, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v23, v1 -; SI-NEXT: v_or_b32_e32 v23, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v23, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v61 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v24, v1 -; SI-NEXT: v_or_b32_e32 v24, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v25, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 -; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v49 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v26, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v37, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v26, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v52, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_or_b32_e32 v27, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v27, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_mov_b32_e32 v34, v41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v28, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v29, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v0, v0, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v30, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v31, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s20, 0xff -; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s24, 0xff -; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_branch .LBB75_3 ; SI-NEXT: .LBB75_2: -; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v43 +; SI-NEXT: v_mov_b32_e32 v36, v61 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v34, v41 +; SI-NEXT: v_mov_b32_e32 v33, v60 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB75_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mov_b32_e32 v35, v57 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) expcnt(3) +; SI-NEXT: v_mov_b32_e32 v59, v62 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB75_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_and_b32 s8, s26, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, s7, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v6, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v7, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v8, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v9, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v10, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v10 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v11, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v11 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v12, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v12 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v13, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v13 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v62, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v14, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v60, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v50, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v16, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v59, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v17, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v18, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v18 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_or_b32_e32 v3, v58, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v19, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v19 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v57, v4 +; SI-NEXT: v_or_b32_e32 v20, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v21, v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v22, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v21 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v22, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v48, v1 +; SI-NEXT: v_or_b32_e32 v1, v38, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v46, v1 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -128665,14 +128162,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -128680,20 +128177,19 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v0 @@ -128739,115 +128235,114 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v23 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 @@ -128858,26 +128353,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v11 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v7 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -128887,807 +128378,814 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:92 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:116 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB75_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff ; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s8, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v46, v0 -; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v49, v0 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v62, v0 -; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v63, v1 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v1 +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_mov_b32_e32 v60, v0 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v35, v0 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v44, v0 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v34, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v59, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v63, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v35, v51 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v61 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v55, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v37, v53 ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v54, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v41, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v33 +; VI-NEXT: v_mov_b32_e32 v38, v54 +; VI-NEXT: v_or_b32_sdwa v0, v54, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v63 +; VI-NEXT: v_mov_b32_e32 v46, v33 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v51, v34 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v61, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v56 -; VI-NEXT: v_or_b32_sdwa v0, v56, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v38, v39 -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v53 -; VI-NEXT: v_mov_b32_e32 v52, v36 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v0, v36, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v61, v60 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v1, v33, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v34, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v36, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v62, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v59, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v50, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v49, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v51, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v56, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v48, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v39, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v50, v40 -; VI-NEXT: v_mov_b32_e32 v49, v51 -; VI-NEXT: v_mov_b32_e32 v40, v34 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v36, v62 +; VI-NEXT: v_mov_b32_e32 v59, v58 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v62, v32 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_branch .LBB75_3 ; VI-NEXT: .LBB75_2: -; VI-NEXT: v_mov_b32_e32 v44, v56 -; VI-NEXT: v_mov_b32_e32 v41, v33 -; VI-NEXT: v_mov_b32_e32 v50, v40 -; VI-NEXT: v_mov_b32_e32 v38, v39 -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v54, v53 -; VI-NEXT: v_mov_b32_e32 v52, v36 -; VI-NEXT: v_mov_b32_e32 v49, v51 +; VI-NEXT: v_mov_b32_e32 v48, v63 +; VI-NEXT: v_mov_b32_e32 v37, v53 +; VI-NEXT: v_mov_b32_e32 v35, v51 +; VI-NEXT: v_mov_b32_e32 v38, v54 +; VI-NEXT: v_mov_b32_e32 v61, v60 +; VI-NEXT: v_mov_b32_e32 v36, v62 +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v46, v33 +; VI-NEXT: v_mov_b32_e32 v50, v32 +; VI-NEXT: v_mov_b32_e32 v59, v58 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB75_3: ; %Flow -; VI-NEXT: v_mov_b32_e32 v51, v41 -; VI-NEXT: v_mov_b32_e32 v36, v44 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v54, v60 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v52, v59 +; VI-NEXT: v_mov_b32_e32 v58, v36 +; VI-NEXT: v_mov_b32_e32 v59, v38 ; VI-NEXT: s_cbranch_vccnz .LBB75_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 -; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 ; VI-NEXT: s_add_i32 s16, s16, 3 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 +; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 +; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_mov_b32_e32 v33, v35 +; VI-NEXT: v_mov_b32_e32 v35, v37 +; VI-NEXT: v_mov_b32_e32 v37, v48 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, s7, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x300, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v16 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v17 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v41, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v19, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x300, v3 +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v2 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v21, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v21 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v46 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v59 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v61 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v58 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v56 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v47 +; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v45 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -129732,128 +129230,130 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:80 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:144 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:176 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v41 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v55 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v45 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v20 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v28 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v30 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 @@ -129863,7 +129363,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: v_lshlrev_b32_e32 v41, 8, v41 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 @@ -129875,16 +129374,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v7 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -129895,423 +129393,410 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v13 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v5 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 -; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:252 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB75_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s8, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s29, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v59, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_mov_b32_e32 v61, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_mov_b32_e32 v37, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v47, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v61, v52 +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v35, v62 -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v40, v30 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v36, v31 -; GFX9-NEXT: v_mov_b32_e32 v45, v62 -; GFX9-NEXT: v_mov_b32_e32 v46, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v53 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v46, v42 +; GFX9-NEXT: v_mov_b32_e32 v53, v51 +; GFX9-NEXT: v_mov_b32_e32 v50, v33 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 +; GFX9-NEXT: v_mov_b32_e32 v42, v36 +; GFX9-NEXT: v_mov_b32_e32 v51, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 -; GFX9-NEXT: s_and_b32 s4, s16, 0xff -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s18, 0xff -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_branch .LBB75_3 ; GFX9-NEXT: .LBB75_2: -; GFX9-NEXT: v_mov_b32_e32 v33, v43 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v62 -; GFX9-NEXT: v_mov_b32_e32 v36, v31 -; GFX9-NEXT: v_mov_b32_e32 v40, v30 +; GFX9-NEXT: v_mov_b32_e32 v61, v52 +; GFX9-NEXT: v_mov_b32_e32 v46, v42 +; GFX9-NEXT: v_mov_b32_e32 v53, v51 +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v50, v33 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB75_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v62, v35 -; GFX9-NEXT: v_mov_b32_e32 v35, v38 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB75_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -130355,160 +129840,163 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_and_b32 s8, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s29, 8 ; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 ; GFX9-NEXT: s_movk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v53 +; GFX9-NEXT: v_mov_b32_e32 v54, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v50 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 @@ -130516,40 +130004,41 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 @@ -130557,153 +130046,159 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 3, v53 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v46 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v62 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v58 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v57 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v56 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v47 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -130949,309 +130444,241 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB75_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v91 -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v49 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 -; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v76 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v77 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v78 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v79 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v35 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v88 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v63 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v72 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v62 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v73 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v74 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v75 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v47 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v56 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v45 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v58 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v59 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v182 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v56 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v40 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v41 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v181 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v165 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v43 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v44 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v163 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v167 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v16, v176 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v164 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v148 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v177 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v146 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v179 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v18, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v147 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v118 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v150 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v151 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v103 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v151 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v160 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v20, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v101 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v132 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v112 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v132 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v133 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v134 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v100 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v84 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v102 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v144 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v119 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v128 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v82 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v25, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v130 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v24, v131 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v84 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v130 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v131 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v2, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v69 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v114 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v28, v87 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v116 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v27, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v94, 0xff, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v81 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v116 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v69 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v2, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v4, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v94, v89 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v98 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v99 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v30, v31 +; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 -; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff -; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16 -; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v2, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB75_3 @@ -131890,309 +131317,241 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB75_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s8, 0xffff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91 -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v49 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49 -; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v76 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v6, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v78 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v7, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v38 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v35 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v88 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v8, v63 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v33 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v9, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v62 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v74 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v10, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v32 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v47 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v45 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v12, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v182 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v13, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v180 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v41 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v14, v42 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v181 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v165 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v15, v44 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v163 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v167 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v16, v176 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v148 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v177 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v17, v178 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v146 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v179 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v18, v149 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v118 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v150 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v19, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v145 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v103 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v20, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v101 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v132 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v21, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v86 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v134 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v22, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v84 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v144 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v23, v119 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v81 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v4, v128 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v24, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v82 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v25, v113 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v130 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v24, v131 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v2, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v27 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v70 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v69 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v114 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v28, v87 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v27, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v94, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v26 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v2, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v4, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v94, v89 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v99 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v30, v31 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff -; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v2, v3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB75_3 @@ -132948,6 +132307,8 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 @@ -132958,8 +132319,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 @@ -134172,6 +133531,7 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 @@ -134182,7 +133542,6 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB77_4 @@ -134236,6 +133595,7 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 @@ -134246,7 +133606,6 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB77_4 @@ -134773,18 +134132,18 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -137336,650 +136695,655 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: v_mov_b32_e32 v43, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v58, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v30, v28 +; SI-NEXT: v_mov_b32_e32 v28, v27 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v41, v23 -; SI-NEXT: v_mov_b32_e32 v29, v20 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v44, v20 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v59, v29 +; SI-NEXT: v_mov_b32_e32 v60, v24 +; SI-NEXT: v_mov_b32_e32 v41, v22 +; SI-NEXT: v_mov_b32_e32 v22, v13 +; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s28 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v37 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s27 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v45, 1.0, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v57 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 ; SI-NEXT: s_cbranch_scc0 .LBB79_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_mov_b32_e32 v47, v3 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v57 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v23 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v1, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 -; SI-NEXT: v_mov_b32_e32 v2, v14 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[2:3], v[14:15], 16 -; SI-NEXT: v_mov_b32_e32 v3, v16 -; SI-NEXT: v_mov_b32_e32 v20, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[2:3], v[10:11], 16 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v63 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v4, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[3:4], v[4:5], 16 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[4:5], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v5, v29 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[4:5], v[6:7], 16 +; SI-NEXT: v_mov_b32_e32 v5, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 -; SI-NEXT: v_mov_b32_e32 v6, v45 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[5:6], v[9:10], 16 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v57 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[45:46], 16 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[6:7], v[8:9], 16 +; SI-NEXT: v_mov_b32_e32 v7, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[7:8], v[39:40], 16 -; SI-NEXT: v_mov_b32_e32 v8, v9 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[7:8], v[40:41], 16 +; SI-NEXT: v_mov_b32_e32 v8, v24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v9 +; SI-NEXT: v_lshr_b64 v[8:9], v[24:25], 16 ; SI-NEXT: v_mov_b32_e32 v9, v54 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[9:10], v[54:55], 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; SI-NEXT: v_mov_b32_e32 v10, v11 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[10:11], v[11:12], 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v11 -; SI-NEXT: v_mov_b32_e32 v11, v56 +; SI-NEXT: v_mov_b32_e32 v10, v53 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v27 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[11:12], v[56:57], 16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 -; SI-NEXT: v_mov_b32_e32 v13, v58 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[10:11], v[53:54], 16 +; SI-NEXT: v_mov_b32_e32 v11, v52 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[13:14], v[58:59], 16 -; SI-NEXT: v_mov_b32_e32 v14, v60 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[11:12], v[52:53], 16 +; SI-NEXT: v_mov_b32_e32 v12, v51 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[14:15], v[60:61], 16 -; SI-NEXT: v_mov_b32_e32 v15, v62 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[12:13], v[51:52], 16 +; SI-NEXT: v_mov_b32_e32 v13, v39 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[15:16], v[62:63], 16 -; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[39:40], 16 +; SI-NEXT: v_mov_b32_e32 v14, v32 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_mov_b32_e32 v40, v17 -; SI-NEXT: v_lshr_b64 v[17:18], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[18:19], v[47:48], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 16 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_mov_b32_e32 v21, v22 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[21:22], v[22:23], 16 -; SI-NEXT: v_mov_b32_e32 v22, v31 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[22:23], v[31:32], 16 -; SI-NEXT: v_mov_b32_e32 v23, v24 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[23:24], v[24:25], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 -; SI-NEXT: v_mov_b32_e32 v24, v41 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[14:15], v[32:33], 16 +; SI-NEXT: v_mov_b32_e32 v15, v16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v41, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[15:16], v[16:17], 16 +; SI-NEXT: v_mov_b32_e32 v16, v18 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[16:17], v[18:19], 16 +; SI-NEXT: v_mov_b32_e32 v17, v62 +; SI-NEXT: v_mov_b32_e32 v19, v63 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v21 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[24:25], v[41:42], 16 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v41, v26 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v50 -; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_lshr_b64 v[17:18], v[62:63], 16 +; SI-NEXT: v_mov_b32_e32 v18, v44 +; SI-NEXT: v_mov_b32_e32 v63, v19 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[18:19], v[44:45], 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: v_mov_b32_e32 v44, v19 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_mov_b32_e32 v19, v61 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[19:20], v[61:62], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v56 +; SI-NEXT: v_mov_b32_e32 v56, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; SI-NEXT: v_mov_b32_e32 v20, v60 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[60:61], 16 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v42 +; SI-NEXT: v_mov_b32_e32 v42, v59 +; SI-NEXT: v_lshr_b64 v[21:22], v[59:60], 16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v47 +; SI-NEXT: v_lshr_b64 v[22:23], v[58:59], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v58, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v43 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: v_mov_b32_e32 v60, v47 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_mov_b32_e32 v23, v31 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[23:24], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v32, v35 ; SI-NEXT: v_lshr_b64 v[25:26], v[26:27], 16 -; SI-NEXT: v_mov_b32_e32 v26, v43 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v46 +; SI-NEXT: v_lshr_b64 v[26:27], v[34:35], 16 +; SI-NEXT: v_mov_b32_e32 v27, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[26:27], v[43:44], 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 ; SI-NEXT: v_lshr_b64 v[27:28], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[51:52], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[53:54], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v57 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; SI-NEXT: v_lshr_b64 v[28:29], v[50:51], 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_mov_b32_e32 v29, v30 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v57 -; SI-NEXT: v_mov_b32_e32 v53, v31 -; SI-NEXT: v_lshr_b64 v[31:32], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[30:31], 16 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v49 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshr_b64 v[34:35], v[49:50], 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshr_b64 v[30:31], v[38:39], 16 +; SI-NEXT: v_mov_b32_e32 v31, v34 ; SI-NEXT: s_branch .LBB79_3 ; SI-NEXT: .LBB79_2: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v44 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v55, v13 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v47 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: v_mov_b32_e32 v42, v59 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v53, v58 +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v50 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v33, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v60, v3 +; SI-NEXT: v_mov_b32_e32 v58, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB79_3: ; %Flow -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v35, v56 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mov_b32_e32 v32, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v33, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v53 +; SI-NEXT: v_mov_b32_e32 v35, v38 +; SI-NEXT: v_mov_b32_e32 v38, v47 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v54, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v36, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v58, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v45, v61 ; SI-NEXT: s_cbranch_vccnz .LBB79_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v47 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshr_b64 v[11:12], v[11:12], 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshr_b64 v[15:16], v[15:16], 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshr_b64 v[16:17], v[16:17], 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v36 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshr_b64 v[19:20], v[19:20], 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 @@ -137988,83 +137352,72 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshr_b64 v[20:21], v[20:21], 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshr_b64 v[21:22], v[21:22], 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshr_b64 v[23:24], v[23:24], 16 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_lshr_b64 v[24:25], v[24:25], 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshr_b64 v[25:26], v[25:26], 16 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v43 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshr_b64 v[32:33], v[32:33], 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_lshr_b64 v[27:28], v[27:28], 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshr_b64 v[28:29], v[28:29], 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshr_b64 v[29:30], v[29:30], 16 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshr_b64 v[30:31], v[30:31], 16 ; SI-NEXT: v_mov_b32_e32 v31, v32 ; SI-NEXT: .LBB79_5: ; %end @@ -138094,36 +137447,36 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 ; VI-NEXT: v_mov_b32_e32 v29, v15 -; VI-NEXT: v_mov_b32_e32 v27, v13 -; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v23, v9 -; VI-NEXT: v_mov_b32_e32 v21, v7 -; VI-NEXT: v_mov_b32_e32 v19, v5 -; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 ; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 +; VI-NEXT: v_mov_b32_e32 v19, v5 ; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 ; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v3, s19 -; VI-NEXT: v_mov_b32_e32 v5, s21 -; VI-NEXT: v_mov_b32_e32 v7, s23 -; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill @@ -139473,8 +138826,8 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v181, v7 :: v_dual_mov_b32 v182, v6 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v183, v5 :: v_dual_mov_b32 v168, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v169, v3 :: v_dual_mov_b32 v170, v2 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, v0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v174, s28 :: v_dual_mov_b32 v173, s29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v171, v1 :: v_dual_mov_b32 v172, s28 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v173, v0 :: v_dual_mov_b32 v174, s29 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB79_4 @@ -139497,769 +138850,655 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xffff0000 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v5, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s25, 16 -; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 ; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s25, 16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v8 :: v_dual_add_nc_u32 v7, v7, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v7, v2 :: v_dual_add_nc_u32 v7, v8, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v9, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v1.l -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.h, v0.l +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s18, 16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s4 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v1.l +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v120.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s3 ; GFX11-TRUE16-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 ; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v5, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff0000 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v9, 16, 1 ; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 -; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v167 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e64 v4, 0x40c00000, s0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v177 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v167 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v176 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, v7, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v167 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v167 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v176 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v176 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v176 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 16, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v5, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v177 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v178 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v179 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v6 :: v_dual_add_f32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v177 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v177 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v178 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v178 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v181 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v4, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v178, 16, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v179 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v179 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v180 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v180 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v180 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v183 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v3, v5, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v181 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v182 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v3, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v181 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v181 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v182 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v182 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v169 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v5, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v183 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v168 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v1.l ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v183 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v183 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v168 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v168 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v168 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v171 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 16, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v7 :: v_dual_add_nc_u32 v3, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v170 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v3, v6 :: v_dual_add_f32 v3, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v168, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v168.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v169 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v169 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v170 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v170 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v171 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v171 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v172 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v170 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v169 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v172 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v6 :: v_dual_add_nc_u32 v4, v7, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v169, 16, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v5, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v169.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v171 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v173 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v173 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v173 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v4, v6 :: v_dual_add_nc_u32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v170, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v170.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v6 :: v_dual_lshlrev_b32 v6, 16, v173 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v174 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, v2, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_add_f32 v5, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v174 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, v9, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v171, 16, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v171.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v172 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v172 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_add_f32 v4, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v173, 16, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v173.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v174 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, v1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v174 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v2, v7 :: v_dual_add_nc_u32 v7, v8, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v9, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v174, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v174.h, v1.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v172, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v172.h, v3.l ; GFX11-TRUE16-NEXT: .LBB79_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v125 :: v_dual_mov_b32 v5, v120 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v114 :: v_dual_mov_b32 v7, v107 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v99 :: v_dual_mov_b32 v9, v90 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v57 :: v_dual_mov_b32 v13, v44 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v17, v173 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v174 :: v_dual_mov_b32 v19, v171 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v30 :: v_dual_mov_b32 v19, v171 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v172 :: v_dual_mov_b32 v17, v174 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v23, v183 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 ; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 @@ -140368,101 +139607,174 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:252 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:308 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:220 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:184 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:124 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v178, v13 :: v_dual_mov_b32 v179, v12 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v11 :: v_dual_mov_b32 v181, v9 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v10 :: v_dual_mov_b32 v169, v7 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v170, v8 :: v_dual_mov_b32 v177, v3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v176, v6 :: v_dual_mov_b32 v171, v4 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v174, v5 :: v_dual_mov_b32 v173, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v2 :: v_dual_mov_b32 v175, v1 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v183, s28 :: v_dual_mov_b32 v172, s29 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v120, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v121, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v122, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v123, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v124, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v125, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v126, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v127, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v136, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v137, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v138, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v139, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v140, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v141, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v142, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v143, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v152, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v153, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v154, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v155, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v156, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:56 +; GFX11-FAKE16-NEXT: s_clause 0xd ; 56-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v171, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v172, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v173, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v174, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v175, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v184, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v185, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v188, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v189, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v190, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v191, s32 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v190, v13 :: v_dual_mov_b32 v191, v12 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v11 :: v_dual_mov_b32 v67, v9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v179, v8 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v70, v7 :: v_dual_mov_b32 v183, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v189, v3 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v180, v4 :: v_dual_mov_b32 v185, v2 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v184, v1 :: v_dual_mov_b32 v69, s28 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v182, v0 :: v_dual_mov_b32 v181, s29 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB79_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, s0 :: v_dual_mov_b32 v37, s2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, s1 :: v_dual_mov_b32 v41, s3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v59, s18 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_mov_b32 v67, s19 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s20 :: v_dual_mov_b32 v97, s22 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v86, s21 :: v_dual_mov_b32 v109, s23 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v122, s24 :: v_dual_mov_b32 v151, s26 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v136, s25 :: v_dual_mov_b32 v15, s27 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v107, s16 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, s20 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v34, s1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v140, s2 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:1080 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:1096 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:1112 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1128 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1144 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1160 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1176 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1192 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v144, s3 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v114, s18 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v76, s17 :: v_dual_mov_b32 v159, s26 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v42, s19 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:952 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:968 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:984 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1000 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1016 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1032 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1048 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1064 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, s22 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:824 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:840 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:856 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:872 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:888 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:904 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:920 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:936 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s23 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:696 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:712 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:728 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:744 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:760 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:776 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:792 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:808 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, s24 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:632 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:648 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:664 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:680 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, s25 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:552 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s27 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:424 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB79_3 ; GFX11-FAKE16-NEXT: .LBB79_2: ; %cmp.true @@ -140470,762 +139782,937 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s27, 16 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v0, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s26, 16 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s6 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v3, 0x40c00000, s4 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 +; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v0 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s25, 0xffff0000 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s25, 16 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xffff0000 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s24, 16 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v10, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v9, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_add_nc_u32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v8, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v3, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v10, v6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v9 :: v_dual_add_nc_u32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:424 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s23, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 ; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s24, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s23, 0xffff0000 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v159, v0, 16, v1 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v1, v7 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v9 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 16 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v151, v0, 16, v1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 -; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v4, v8, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v11, v7 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v10, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s22, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v11, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v8 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v9, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s22, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:552 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s21, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v11, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, v14, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_nc_u32 v7, v9, v11 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v13, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v16, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v16, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:632 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:648 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:664 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:680 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s20, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v7, v14 :: v_dual_add_nc_u32 v10, v10, v13 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x7fff, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v12, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v12, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s20, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v10, v14, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:696 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:712 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:728 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:744 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:760 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:776 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:792 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:808 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s19, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v18, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v17, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s19, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v13 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v14 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v16, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v18, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v21, v17, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x7fff, v14 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:824 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:840 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:856 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:872 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:888 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:904 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:920 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:936 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s18, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, v21, v17 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v20 :: v_dual_add_nc_u32 v13, v16, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x7fff, v19 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v20, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s18, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x7fff, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v18 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v22, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:952 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:968 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:984 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1000 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1016 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1032 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1048 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1064 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s17, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, v17, v20 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v21 :: v_dual_and_b32 v13, 0xffff, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x7fff, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, v19, v22 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v19, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s17, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x7fff, v18 -; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v19, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v17, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[0:3], s32 offset:1080 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[4:7], s32 offset:1096 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[8:11], s32 offset:1112 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[12:15], s32 offset:1128 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[16:19], s32 offset:1144 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[20:23], s32 offset:1160 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[24:27], s32 offset:1176 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[28:31], s32 offset:1192 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s16, 16 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v69 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v24, v19 -; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v25, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v17, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v23, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s16, 16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v21 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, v22, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v24, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v23, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x7fff, v21 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v42, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v114, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, v27, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v26 :: v_dual_add_nc_u32 v20, v22, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x7fff, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v26, 0x40c00000, s4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x7fff, v20 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v28, 0x40c00000, s3 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-FAKE16-NEXT: v_bfe_u32 v25, v28, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s3 ; GFX11-FAKE16-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v24, v20, v27 :: v_dual_add_nc_u32 v23, v23, v26 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v22 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x7fff, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, v25, v28 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v25, 0x40c00000, s3 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 ; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x7fff, v24 -; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v25, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v31, 0x40c00000, s2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v7, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s3 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v107, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_and_b32 s2, s1, 0xffff0000 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v30, v25 -; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v31, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v29, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e64 v29, 0x40c00000, s2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v27 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, v28, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v30, 0x40c00000, s1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x7fff, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v8, 0x40c00000, s1 ; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff0000 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v28, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v33, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v26 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v27, v32 :: v_dual_add_nc_u32 v26, v28, v30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v29 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v32, 0x40c00000, s1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x7fff, v26 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_add_f32_e64 v34, 0x40c00000, s0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v31, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v26, v33, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, v29, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v178 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v30 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, v31, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v178 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x7fff, v28 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v109, v5, 16, v7 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x7fff, v30 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v28, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v31, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v33 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v179 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v179 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v37, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v34, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v33, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v180 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v6, 0x40c00000, s2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v144, v0, 16, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v33, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v178, v31, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v36, v37 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v33, v48 :: v_dual_lshlrev_b32 v36, 16, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v182 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v32, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v136, v2, 16, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v48 :: v_dual_add_nc_u32 v38, v38, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v37, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v181 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v181 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v31, 16, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, v35, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v170 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v36, v38 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v36, 16, v170 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v7, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v9, 0x40c00000, s0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e64 v5, 0x40c00000, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v140, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v31, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v38, v35 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v39, v36 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v169 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v169 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v32, 16, v33 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v176 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, v35, v37 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v35, 16, v176 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v170, v33, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v49, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v32, v48 :: v_dual_add_nc_u32 v33, v37, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v174 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v34, v34, v36 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v174 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v4, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v9 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v190 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v190 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v1, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_add_nc_u32 v3, v5, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v191 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v7, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v32 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v191 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v4, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v32 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v190, v1, 16, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_cndmask_b32 v33, v33, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v169, v31, 16, v32 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v3, v5, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v68 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v191, v2, 16, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v37, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v67 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v7, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v68 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v6, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v67 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v31, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v171 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v177 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v31, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v176, v33, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x7fff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v37, 16, v171 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v5, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v179 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v7, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v179 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v68, v1, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v8, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v9, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v70 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v8 :: v_dual_and_b32 v6, 0xffff0000, v70 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v2, 16, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v188 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, v5, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v188 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v179, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v6, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_add_nc_u32 v7, v11, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v183 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_add_f32 v5, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v183 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v3, v3, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v70, v1, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v7, v5 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v180 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v189 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v1, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v188, v3, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v7, 16, v180 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v31, v34 :: v_dual_add_nc_u32 v36, v37, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v177 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_add_nc_u32 v6, v7, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v189 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_nc_u32 v34, v34, v35 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_add_nc_u32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v36, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x7fff, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v50, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v184 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v12, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v185 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v34, v34, v48 :: v_dual_add_nc_u32 v35, v49, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_add_nc_u32 v5, v11, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v185 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x7fff, v35 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_cndmask_b32 v35, v35, v49 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v12 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v10 :: v_dual_cndmask_b32 v5, v5, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v7, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v174, v33, 16, v31 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v171, v32, 16, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, v48, v37 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v175 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v175 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, v39, v38 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v177, v35, 16, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x7fff, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x7fff, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v180, v2, 16, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, v10, v7 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v184 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v184 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v8 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v189, v5, 16, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x7fff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v33, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v35, vcc_lo -; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v173 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v173 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v33 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v37, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, v39, v34 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v182 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v2, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v7, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, v9, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v35, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v122, v3, 16, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x7fff, v38 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v36, v49 :: v_dual_lshlrev_b32 v48, 16, v183 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v5 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v37, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v172 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v172 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, v36, v38 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v36, 0x7fff, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v50, 0x40c00000, v51 :: v_dual_add_nc_u32 v49, v50, v37 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, v38, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v6, v11 :: v_dual_lshlrev_b32 v10, 16, v69 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v8, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v181 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v181 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, v6, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v13 :: v_dual_add_nc_u32 v11, v12, v7 +; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v9 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v49, 0x7fff, v49 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, v51, v48 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x7fff, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v52, v52, v50 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x7fff, v11 +; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v10 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x7fff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, v14, v12 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v38, v38, v54 :: v_dual_add_nc_u32 v51, 0x7fff, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x7fff, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v32, 16, v31 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v175, v33, 16, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v49, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v173, v35, 16, v36 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v97, v8, 16, v10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v86, v9, 16, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v76, v11, 16, v13 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v67, v14, 16, v17 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v172, v37, 16, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v59, v16, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v52, v18, 16, v20 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v46, v21, 16, v23 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v41, v22, 16, v25 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v183, v39, 16, v48 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v37, v24, 16, v27 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v26, 16, v28 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v32, v29, 16, v30 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v16 :: v_dual_add_nc_u32 v13, 0x7fff, v13 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x7fff, v14 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v13, v17, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v185, v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v184, v3, 16, v4 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v182, v5, 16, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v14, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v181, v7, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v69, v9, 16, v10 ; GFX11-FAKE16-NEXT: .LBB79_3: ; %end -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v41 :: v_dual_mov_b32 v4, v46 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v59 :: v_dual_mov_b32 v9, v86 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v67 :: v_dual_mov_b32 v8, v76 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v10, v97 :: v_dual_mov_b32 v13, v136 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v11, v109 :: v_dual_mov_b32 v12, v122 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v14, v151 :: v_dual_mov_b32 v17, v172 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, v173 :: v_dual_mov_b32 v19, v175 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:100 -; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:104 -; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:108 -; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:112 -; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 -; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 -; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v34 :: v_dual_mov_b32 v2, v140 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v42 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:1080 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:1096 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:1112 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:1128 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:1144 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:1160 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:1176 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:1192 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v144 :: v_dual_mov_b32 v4, v107 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v76 :: v_dual_mov_b32 v6, v114 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v19, v184 :: v_dual_mov_b32 v20, v185 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v21, v189 :: v_dual_mov_b32 v22, v180 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v23, v183 :: v_dual_mov_b32 v24, v188 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v29, v32 :: v_dual_mov_b32 v30, v191 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v31, v190 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v17, v181 :: v_dual_mov_b32 v18, v182 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v25, v70 :: v_dual_mov_b32 v26, v179 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v27, v67 :: v_dual_mov_b32 v28, v68 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v14, v159 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v69 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, v41 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:952 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:968 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:984 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:1000 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:1016 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:1032 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:1048 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:1064 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v42 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:824 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:840 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:856 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:872 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:888 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:904 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:920 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:936 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v43 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:696 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:712 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:728 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:744 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:760 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:776 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:792 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:808 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, v44 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:568 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:584 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:600 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:616 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:632 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:648 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:664 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:680 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v45 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:440 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:456 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:472 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:488 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:504 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:520 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:536 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:552 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v46 +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 224-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b128 v[33:36], off, s32 offset:312 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[37:40], off, s32 offset:328 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[41:44], off, s32 offset:344 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[45:48], off, s32 offset:360 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[49:52], off, s32 offset:376 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[53:56], off, s32 offset:392 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[57:60], off, s32 offset:408 +; GFX11-FAKE16-NEXT: scratch_load_b128 v[61:64], off, s32 offset:424 +; GFX11-FAKE16-NEXT: scratch_load_b32 v191, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v190, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v189, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v188, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v185, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v173, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v172, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v171, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v170, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v169, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v168, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v159, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v158, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v157, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v156, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v155, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v154, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v153, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v152, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v143, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v142, off, s32 offset:92 ; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 -; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 -; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 -; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:140 -; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:144 -; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:148 -; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:152 -; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:156 -; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:160 -; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:164 -; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:168 -; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:172 -; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:176 -; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:180 -; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:184 -; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:188 -; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:192 -; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:196 -; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:200 -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:204 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:208 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:212 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:216 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:220 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:224 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:228 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:232 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:236 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:240 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:268 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:272 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:276 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:280 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:284 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:288 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v34 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, v37 :: v_dual_mov_b32 v5, v52 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v16, v183 :: v_dual_mov_b32 v21, v177 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v24, v176 :: v_dual_mov_b32 v27, v181 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v28, v182 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v30, v179 :: v_dual_mov_b32 v31, v178 +; GFX11-FAKE16-NEXT: scratch_load_b32 v141, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v140, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v139, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:128 +; GFX11-FAKE16-NEXT: scratch_load_b32 v124, off, s32 offset:132 +; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:136 +; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:140 +; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:144 +; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:148 +; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:152 +; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:156 +; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:160 +; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:164 +; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:168 +; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:172 +; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:176 +; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:180 +; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:184 +; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:188 +; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:192 +; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:196 +; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:200 +; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:204 +; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:208 +; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:212 +; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:216 +; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:220 +; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:224 +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:228 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:232 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:236 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:240 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:244 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:248 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:252 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:256 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:260 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:264 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:268 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:272 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:276 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:280 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:284 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:288 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:292 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:296 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:300 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:304 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:308 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, v48 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-FAKE16-NEXT: .LBB79_4: -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; kill: killed $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175_vgpr176 ; GFX11-FAKE16-NEXT: s_branch .LBB79_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -141251,71 +140738,70 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -141332,37 +140818,38 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -141370,150 +140857,154 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB80_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 -; SI-NEXT: v_mov_b32_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 -; SI-NEXT: v_mov_b32_e32 v50, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v54, v7 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v3 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v62, v2 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 -; SI-NEXT: v_mov_b32_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v35 -; SI-NEXT: v_mov_b32_e32 v35, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v33 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 @@ -141523,92 +141014,107 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: .LBB80_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB80_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[53:54], v[9:10], 1.0 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v25 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[33:34], v[5:6], 1.0 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v56, v33 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_add_f64 v[35:36], v[11:12], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v23 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[41:42], v[7:8], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 @@ -141628,252 +141134,241 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: v_mov_b32_e32 v38, v32 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: .LBB80_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -141883,8 +141378,8 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -141894,8 +141389,8 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -141905,8 +141400,8 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -141916,8 +141411,8 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -141926,58 +141421,56 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -142124,21 +141617,21 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v22, s19 +; SI-NEXT: v_mov_b32_e32 v31, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v32, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v30, s19 ; SI-NEXT: v_mov_b32_e32 v27, s20 ; SI-NEXT: v_mov_b32_e32 v28, s21 -; SI-NEXT: v_mov_b32_e32 v31, s22 -; SI-NEXT: v_mov_b32_e32 v32, s23 -; SI-NEXT: v_mov_b32_e32 v29, s24 -; SI-NEXT: v_mov_b32_e32 v30, s25 -; SI-NEXT: v_mov_b32_e32 v25, s26 -; SI-NEXT: v_mov_b32_e32 v26, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: v_mov_b32_e32 v24, s29 +; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v26, s23 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -142164,281 +141657,270 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v33 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v33 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v33 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v25 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v33 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v33 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v33 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v21 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 ; SI-NEXT: s_cbranch_execnz .LBB81_3 ; SI-NEXT: .LBB81_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f64 v[33:34], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v7 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_f64 v[53:54], v[21:22], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[1:2], v[21:22], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 -; SI-NEXT: v_add_f64 v[1:2], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_add_f64 v[33:34], v[25:26], 1.0 +; SI-NEXT: v_add_f64 v[41:42], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -142447,48 +141929,54 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v6 -; SI-NEXT: v_mov_b32_e32 v48, v16 -; SI-NEXT: v_mov_b32_e32 v38, v17 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 +; SI-NEXT: v_mov_b32_e32 v48, v18 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: .LBB81_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 @@ -142496,15 +141984,15 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 @@ -142517,7 +142005,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -142529,52 +142017,44 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -142583,7 +142063,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -142592,7 +142072,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -142602,8 +142082,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -142613,8 +142093,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -142624,8 +142104,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -142635,8 +142115,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -142646,8 +142126,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -142657,8 +142137,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -142668,8 +142148,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -142679,8 +142159,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -142690,8 +142170,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -142701,8 +142181,8 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -142711,69 +142191,77 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -142796,103 +142284,103 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB81_4: -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: s_branch .LBB81_2 ; ; VI-LABEL: bitcast_v16f64_to_v64f16_scalar: @@ -142919,6 +142407,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 @@ -142929,7 +142418,6 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB81_4 @@ -142983,6 +142471,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 @@ -142993,7 +142482,6 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB81_4 @@ -143531,218 +143019,210 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB82_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v48 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v39 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v37 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 @@ -143750,27 +143230,37 @@ define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload @@ -144123,22 +143613,23 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v28 ; SI-NEXT: v_mov_b32_e32 v53, v26 -; SI-NEXT: v_mov_b32_e32 v45, v6 +; SI-NEXT: v_mov_b32_e32 v41, v6 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -144150,40 +143641,40 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 ; SI-NEXT: v_mov_b32_e32 v54, v14 ; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_mov_b32_e32 v41, v11 -; SI-NEXT: v_mov_b32_e32 v40, v10 -; SI-NEXT: v_mov_b32_e32 v44, v9 -; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: v_mov_b32_e32 v45, v9 +; SI-NEXT: v_mov_b32_e32 v44, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 @@ -144194,25 +143685,25 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v39 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v28 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v31 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v38, v60 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v42 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v37, v62 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -144224,68 +143715,77 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v63, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB83_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v36, v54 +; SI-NEXT: v_or_b32_e32 v19, v54, v19 +; SI-NEXT: v_mov_b32_e32 v54, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v4, v34, v4 +; SI-NEXT: v_mov_b32_e32 v34, v53 +; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_mov_b32_e32 v53, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_or_b32_e32 v21, v52, v21 +; SI-NEXT: v_mov_b32_e32 v52, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v51, v22 ; SI-NEXT: v_mov_b32_e32 v51, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 @@ -144307,82 +143807,76 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v27, v38, v27 ; SI-NEXT: v_mov_b32_e32 v38, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v28, v37, v28 ; SI-NEXT: v_mov_b32_e32 v37, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 ; SI-NEXT: v_or_b32_e32 v9, v14, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_or_b32_e32 v19, v54, v19 -; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 -; SI-NEXT: v_or_b32_e32 v2, v60, v2 -; SI-NEXT: v_or_b32_e32 v3, v42, v3 -; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_mov_b32_e32 v63, v44 -; SI-NEXT: v_or_b32_e32 v11, v44, v11 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_or_b32_e32 v11, v46, v11 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_or_b32_e32 v12, v61, v12 ; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_or_b32_e32 v12, v59, v12 +; SI-NEXT: v_or_b32_e32 v13, v59, v13 ; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_or_b32_e32 v13, v57, v13 ; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_or_b32_e32 v14, v47, v14 ; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_or_b32_e32 v14, v45, v14 +; SI-NEXT: v_or_b32_e32 v15, v45, v15 ; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v15, v43, v15 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_or_b32_e32 v16, v41, v16 -; SI-NEXT: v_or_b32_e32 v17, v40, v17 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_or_b32_e32 v18, v55, v18 -; SI-NEXT: v_or_b32_e32 v20, v53, v20 +; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_mov_b32_e32 v42, v55 +; SI-NEXT: v_or_b32_e32 v17, v55, v17 +; SI-NEXT: v_mov_b32_e32 v33, v40 +; SI-NEXT: v_or_b32_e32 v18, v40, v18 ; SI-NEXT: v_or_b32_e32 v30, v32, v30 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_or_b32_e32 v31, v36, v31 +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: v_or_b32_e32 v31, v63, v31 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB83_3 ; SI-NEXT: .LBB83_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v62, v61 ; SI-NEXT: v_mov_b32_e32 v60, v59 ; SI-NEXT: v_mov_b32_e32 v58, v57 ; SI-NEXT: v_mov_b32_e32 v56, v47 ; SI-NEXT: v_mov_b32_e32 v46, v45 ; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v42, v55 +; SI-NEXT: v_mov_b32_e32 v33, v40 +; SI-NEXT: v_mov_b32_e32 v36, v54 ; SI-NEXT: v_mov_b32_e32 v54, v20 -; SI-NEXT: v_mov_b32_e32 v33, v52 -; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v34, v53 +; SI-NEXT: v_mov_b32_e32 v32, v35 +; SI-NEXT: v_mov_b32_e32 v53, v21 +; SI-NEXT: v_mov_b32_e32 v52, v22 ; SI-NEXT: v_mov_b32_e32 v51, v23 ; SI-NEXT: v_mov_b32_e32 v50, v24 ; SI-NEXT: v_mov_b32_e32 v49, v25 @@ -144393,298 +143887,290 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB83_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v35 -; SI-NEXT: v_mov_b32_e32 v35, v40 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v40, v46 -; SI-NEXT: v_mov_b32_e32 v41, v56 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v43, v60 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v35, v63 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: v_mov_b32_e32 v59, v46 +; SI-NEXT: v_mov_b32_e32 v42, v56 +; SI-NEXT: v_mov_b32_e32 v43, v58 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v46, v41 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB83_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v47 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_mov_b32_e32 v55, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v40 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v55 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v34 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v38 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v58 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v46 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v43 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v59 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v35 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v54 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v22, v20 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v50 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -144697,7 +144183,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v28, v26 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 @@ -144714,7 +144200,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 ; SI-NEXT: v_or_b32_e32 v28, v30, v28 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 @@ -144722,7 +144208,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v31, v29 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 @@ -144752,7 +144238,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64f16_to_v16f64_scalar: @@ -144983,252 +144469,214 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v160, v13 :: v_dual_mov_b32 v161, v12 +; GFX11-NEXT: v_dual_mov_b32 v162, v11 :: v_dual_mov_b32 v163, v10 +; GFX11-NEXT: v_dual_mov_b32 v164, v9 :: v_dual_mov_b32 v165, v8 +; GFX11-NEXT: v_dual_mov_b32 v166, v7 :: v_dual_mov_b32 v167, v6 +; GFX11-NEXT: v_dual_mov_b32 v176, v5 :: v_dual_mov_b32 v177, v4 +; GFX11-NEXT: v_dual_mov_b32 v178, v3 :: v_dual_mov_b32 v179, v2 +; GFX11-NEXT: v_dual_mov_b32 v180, v1 :: v_dual_mov_b32 v181, v0 +; GFX11-NEXT: v_dual_mov_b32 v182, s28 :: v_dual_mov_b32 v183, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:96 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 ; GFX11-NEXT: s_cbranch_scc0 .LBB83_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v27, s18 +; GFX11-NEXT: v_dual_mov_b32 v20, s17 :: v_dual_mov_b32 v35, s19 +; GFX11-NEXT: v_dual_mov_b32 v44, s20 :: v_dual_mov_b32 v65, s22 +; GFX11-NEXT: v_dual_mov_b32 v54, s21 :: v_dual_mov_b32 v77, s23 +; GFX11-NEXT: v_dual_mov_b32 v90, s24 :: v_dual_mov_b32 v119, s26 +; GFX11-NEXT: v_dual_mov_b32 v104, s25 :: v_dual_mov_b32 v135, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB83_3 ; GFX11-NEXT: .LBB83_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v30, 0x200, s27 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v135, 0x200, s27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v119, 0x200, s26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v104, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v90, 0x200, s24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v77, 0x200, s23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v65, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v54, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v44, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v35, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, s18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v160, 0x200, v160 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v161, 0x200, v161 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v162, 0x200, v162 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v163, 0x200, v163 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v164, 0x200, v164 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v165, 0x200, v165 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v166, 0x200, v166 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v167, 0x200, v167 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v176, 0x200, v176 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v177, 0x200, v177 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v178, 0x200, v178 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v179, 0x200, v179 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v180, 0x200, v180 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v181, 0x200, v181 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v183, 0x200, v183 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v170, 0x200, v170 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v171, 0x200, v171 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v172, 0x200, v172 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v173, 0x200, v173 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v174, 0x200, v174 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v175, 0x200, v175 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v184, 0x200, v184 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v151, 0x200, s25 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v137, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v124, 0x200, s23 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v112, 0x200, s22 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v101, 0x200, s21 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v91, 0x200, s20 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v82, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v74, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v67, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v61, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v56, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v49, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v47, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v182, 0x200, v182 op_sel_hi:[0,1] ; GFX11-NEXT: .LBB83_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 +; GFX11-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 +; GFX11-NEXT: v_mov_b32_e32 v13, v104 ; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:220 +; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14 +; GFX11-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 +; GFX11-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65 +; GFX11-NEXT: v_mov_b32_e32 v14, v119 +; GFX11-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v182 +; GFX11-NEXT: v_dual_mov_b32 v17, v183 :: v_dual_mov_b32 v18, v181 +; GFX11-NEXT: v_dual_mov_b32 v19, v180 :: v_dual_mov_b32 v20, v179 +; GFX11-NEXT: v_dual_mov_b32 v21, v178 :: v_dual_mov_b32 v22, v177 +; GFX11-NEXT: v_dual_mov_b32 v23, v176 :: v_dual_mov_b32 v24, v167 +; GFX11-NEXT: v_dual_mov_b32 v25, v166 :: v_dual_mov_b32 v26, v165 +; GFX11-NEXT: v_dual_mov_b32 v27, v164 :: v_dual_mov_b32 v28, v163 +; GFX11-NEXT: v_dual_mov_b32 v29, v162 :: v_dual_mov_b32 v30, v161 +; GFX11-NEXT: v_mov_b32_e32 v31, v160 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB83_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 ; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109 +; GFX11-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122 +; GFX11-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136 +; GFX11-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151 ; GFX11-NEXT: s_branch .LBB83_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -145272,7 +144720,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -145281,7 +144729,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr51 @@ -145324,8 +144772,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v57, v4, v3, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 @@ -145338,7 +144785,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 @@ -145383,8 +144830,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v42, v10, v9, 16 ; SI-NEXT: v_alignbit_b32 v45, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v47, v6, v5, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v57, v4, v3, 16 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_alignbit_b32 v60, v2, v1, 16 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v32 @@ -145397,7 +144843,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 @@ -145409,9 +144855,9 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; SI-NEXT: .LBB84_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v60 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -145421,9 +144867,9 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -145476,7 +144922,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -145747,6 +145193,7 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: v_mov_b32_e32 v31, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v32, s17 ; SI-NEXT: v_mov_b32_e32 v29, s18 ; SI-NEXT: v_mov_b32_e32 v30, s19 @@ -145758,7 +145205,6 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v24, s25 ; SI-NEXT: v_mov_b32_e32 v21, s26 ; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v19, s28 ; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -145785,158 +145231,146 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[34:35], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[35:36], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[3:4], 16 ; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[1:2], 16 ; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[19:20], 16 ; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[39:40], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[40:41], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[41:42], v[27:28], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[42:43], v[29:30], 16 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v32 -; SI-NEXT: v_lshr_b64 v[54:55], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 ; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[43:44], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[39:40], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v30 +; SI-NEXT: v_lshr_b64 v[41:42], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 ; SI-NEXT: s_cbranch_execnz .LBB85_3 ; SI-NEXT: .LBB85_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_lshr_b64 v[34:35], v[15:16], 16 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_lshr_b64 v[35:36], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[5:6], 16 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[3:4], 16 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_lshr_b64 v[39:40], v[23:24], 16 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[1:2], 16 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_lshr_b64 v[40:41], v[25:26], 16 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[19:20], 16 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: v_lshr_b64 v[41:42], v[27:28], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[42:43], v[29:30], 16 -; SI-NEXT: v_lshr_b64 v[54:55], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 ; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[43:44], v[31:32], 16 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v32 +; SI-NEXT: v_lshr_b64 v[39:40], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 ; SI-NEXT: .LBB85_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v43 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v50 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; SI-NEXT: v_or_b32_e32 v31, v31, v41 ; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v55 ; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v42 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v40 ; SI-NEXT: v_or_b32_e32 v29, v29, v31 ; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v44 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v41 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v39 ; SI-NEXT: v_or_b32_e32 v27, v27, v29 ; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v43 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v54 ; SI-NEXT: v_or_b32_e32 v25, v25, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v63 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v39 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v53 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v62 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen @@ -145948,55 +145382,55 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v61 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v60 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 ; SI-NEXT: v_or_b32_e32 v1, v1, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -146008,7 +145442,7 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -146018,11 +145452,9 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -146032,11 +145464,9 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -146046,11 +145476,9 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -146102,42 +145530,39 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB85_4: ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 ; SI-NEXT: s_branch .LBB85_2 ; ; VI-LABEL: bitcast_v16f64_to_v64i16_scalar: @@ -146164,6 +145589,7 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 ; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 @@ -146174,7 +145600,6 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB85_4 @@ -146228,6 +145653,7 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 @@ -146238,7 +145664,6 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB85_4 @@ -146403,10 +145828,10 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -146626,8 +146051,8 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v11, v11, v44 ; SI-NEXT: v_or_b32_e32 v12, v12, v43 ; SI-NEXT: v_or_b32_e32 v13, v13, v42 -; SI-NEXT: v_or_b32_e32 v14, v14, v40 -; SI-NEXT: v_or_b32_e32 v15, v15, v55 +; SI-NEXT: v_or_b32_e32 v14, v14, v55 +; SI-NEXT: v_or_b32_e32 v15, v15, v40 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -146648,8 +146073,8 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 @@ -146827,8 +146252,8 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v11, v44, v11 ; SI-NEXT: v_or_b32_e32 v12, v43, v12 ; SI-NEXT: v_or_b32_e32 v13, v42, v13 -; SI-NEXT: v_or_b32_e32 v14, v40, v14 -; SI-NEXT: v_or_b32_e32 v15, v55, v15 +; SI-NEXT: v_or_b32_e32 v14, v55, v14 +; SI-NEXT: v_or_b32_e32 v15, v40, v15 ; SI-NEXT: v_or_b32_e32 v19, v39, v19 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 @@ -147159,222 +146584,207 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v12 -; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v38, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v45, v12 +; SI-NEXT: v_mov_b32_e32 v46, v10 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v57 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v55 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB87_2 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB87_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v7, v0, v48 +; SI-NEXT: v_mov_b32_e32 v60, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v7, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v8, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v39 +; SI-NEXT: v_or_b32_e32 v9, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v11, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v12, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v13, v0, v44 +; SI-NEXT: v_or_b32_e32 v10, v0, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v12, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v13, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v43 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v0, v15 +; SI-NEXT: v_or_b32_e32 v15, v0, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v16, v0, v42 +; SI-NEXT: v_or_b32_e32 v16, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v18, v0, v41 +; SI-NEXT: v_or_b32_e32 v18, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v19, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v20, v0, v37 +; SI-NEXT: v_or_b32_e32 v20, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v22, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: v_or_b32_e32 v24, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: v_or_b32_e32 v26, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 ; SI-NEXT: v_or_b32_e32 v27, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v28, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v29, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_or_b32_e32 v28, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_or_b32_e32 v29, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_or_b32_e32 v8, v1, v56 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v55, v61 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: v_or_b32_e32 v31, v0, v31 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB87_3 -; SI-NEXT: .LBB87_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v50, v37 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_mov_b32_e32 v38, v2 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v54 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v62, v52 -; SI-NEXT: v_mov_b32_e32 v60, v59 -; SI-NEXT: v_mov_b32_e32 v49, v51 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: .LBB87_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v58, v49 -; SI-NEXT: s_cbranch_vccnz .LBB87_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_mov_b32_e32 v54, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v48 +; SI-NEXT: v_mov_b32_e32 v61, v39 +; SI-NEXT: v_mov_b32_e32 v59, v37 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v53, v35 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v63, v56 +; SI-NEXT: v_or_b32_e32 v31, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -147419,139 +146829,134 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -147560,7 +146965,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB87_5: ; %end +; SI-NEXT: .LBB87_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -147577,8 +146982,26 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB87_4: +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v48 +; SI-NEXT: v_mov_b32_e32 v61, v39 +; SI-NEXT: v_mov_b32_e32 v60, v0 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v59, v37 +; SI-NEXT: v_mov_b32_e32 v57, v58 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v54, v4 +; SI-NEXT: v_mov_b32_e32 v53, v35 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v63, v56 +; SI-NEXT: v_mov_b32_e32 v47, v46 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: s_branch .LBB87_2 ; ; VI-LABEL: bitcast_v64i16_to_v16f64_scalar: ; VI: ; %bb.0: @@ -147915,252 +147338,214 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:280 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:276 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:272 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:268 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:264 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:260 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:256 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:252 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:248 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:244 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:240 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:236 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:232 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:228 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:224 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:220 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:216 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:152 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:148 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:144 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:140 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:136 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:132 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:128 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:124 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:120 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:116 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:112 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:108 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:104 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:100 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:96 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:92 -; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v143, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v152, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v153, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v154, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v155, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v156, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v171, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v172, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v173, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v174, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v175, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v184, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v185, s32 -; GFX11-NEXT: v_dual_mov_b32 v176, v13 :: v_dual_mov_b32 v177, v12 -; GFX11-NEXT: v_dual_mov_b32 v178, v11 :: v_dual_mov_b32 v179, v10 -; GFX11-NEXT: v_dual_mov_b32 v180, v9 :: v_dual_mov_b32 v181, v8 -; GFX11-NEXT: v_dual_mov_b32 v182, v7 :: v_dual_mov_b32 v183, v6 -; GFX11-NEXT: v_dual_mov_b32 v170, v5 :: v_dual_mov_b32 v171, v4 -; GFX11-NEXT: v_dual_mov_b32 v172, v3 :: v_dual_mov_b32 v173, v2 -; GFX11-NEXT: v_dual_mov_b32 v174, v1 :: v_dual_mov_b32 v175, v0 -; GFX11-NEXT: v_dual_mov_b32 v184, s28 :: v_dual_mov_b32 v185, s29 +; GFX11-NEXT: v_dual_mov_b32 v160, v13 :: v_dual_mov_b32 v161, v12 +; GFX11-NEXT: v_dual_mov_b32 v162, v11 :: v_dual_mov_b32 v163, v10 +; GFX11-NEXT: v_dual_mov_b32 v164, v9 :: v_dual_mov_b32 v165, v8 +; GFX11-NEXT: v_dual_mov_b32 v166, v7 :: v_dual_mov_b32 v167, v6 +; GFX11-NEXT: v_dual_mov_b32 v176, v5 :: v_dual_mov_b32 v177, v4 +; GFX11-NEXT: v_dual_mov_b32 v178, v3 :: v_dual_mov_b32 v179, v2 +; GFX11-NEXT: v_dual_mov_b32 v180, v1 :: v_dual_mov_b32 v181, v0 +; GFX11-NEXT: v_dual_mov_b32 v182, s28 :: v_dual_mov_b32 v183, s29 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:196 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:160 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:156 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:152 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:148 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:144 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:140 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:136 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:132 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:128 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:124 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:120 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:116 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:112 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:108 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:104 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:100 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:96 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v139, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v140, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v141, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v142, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v143, s32 ; GFX11-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_dual_mov_b32 v47, s0 :: v_dual_mov_b32 v52, s2 -; GFX11-NEXT: v_dual_mov_b32 v49, s1 :: v_dual_mov_b32 v56, s3 -; GFX11-NEXT: v_dual_mov_b32 v61, s16 :: v_dual_mov_b32 v74, s18 -; GFX11-NEXT: v_dual_mov_b32 v67, s17 :: v_dual_mov_b32 v82, s19 -; GFX11-NEXT: v_dual_mov_b32 v91, s20 :: v_dual_mov_b32 v112, s22 -; GFX11-NEXT: v_dual_mov_b32 v101, s21 :: v_dual_mov_b32 v124, s23 -; GFX11-NEXT: v_dual_mov_b32 v137, s24 :: v_dual_mov_b32 v14, s26 -; GFX11-NEXT: v_dual_mov_b32 v151, s25 :: v_dual_mov_b32 v30, s27 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v27, s18 +; GFX11-NEXT: v_dual_mov_b32 v20, s17 :: v_dual_mov_b32 v35, s19 +; GFX11-NEXT: v_dual_mov_b32 v44, s20 :: v_dual_mov_b32 v65, s22 +; GFX11-NEXT: v_dual_mov_b32 v54, s21 :: v_dual_mov_b32 v77, s23 +; GFX11-NEXT: v_dual_mov_b32 v90, s24 :: v_dual_mov_b32 v119, s26 +; GFX11-NEXT: v_dual_mov_b32 v104, s25 :: v_dual_mov_b32 v135, s27 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB87_3 ; GFX11-NEXT: .LBB87_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v30, s27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v135, s27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v119, s26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v104, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v90, s24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v77, s23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v65, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v54, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v44, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v35, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, s18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, s16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v160, v160, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v161, v161, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v162, v162, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v163, v163, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v164, v164, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v165, v165, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v166, v166, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v167, v167, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v176, v176, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v177, v177, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v178, v178, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v179, v179, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v180, v180, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v181, v181, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v183, v183, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v170, v170, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v171, v171, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v172, v172, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v173, v173, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v174, v174, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v175, v175, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v184, v184, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v151, s25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v137, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v124, s23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v112, s22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v101, s21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v91, s20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v82, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v74, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v67, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v61, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v56, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v49, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v47, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v182, v182, 3 op_sel_hi:[1,0] ; GFX11-NEXT: .LBB87_3: ; %end -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v47 :: v_dual_mov_b32 v1, v49 -; GFX11-NEXT: v_dual_mov_b32 v3, v56 :: v_dual_mov_b32 v4, v61 -; GFX11-NEXT: v_dual_mov_b32 v6, v74 :: v_dual_mov_b32 v9, v101 -; GFX11-NEXT: v_dual_mov_b32 v7, v82 :: v_dual_mov_b32 v8, v91 -; GFX11-NEXT: v_dual_mov_b32 v11, v124 :: v_dual_mov_b32 v12, v137 -; GFX11-NEXT: v_dual_mov_b32 v15, v30 :: v_dual_mov_b32 v16, v184 -; GFX11-NEXT: v_dual_mov_b32 v17, v185 :: v_dual_mov_b32 v18, v175 -; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 -; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 -; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 +; GFX11-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 +; GFX11-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 +; GFX11-NEXT: v_mov_b32_e32 v13, v104 ; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v185, off, s32 -; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v174, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v173, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v172, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v171, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v170, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v169, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v168, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v159, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v158, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v157, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v156, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v155, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v154, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v153, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v152, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v143, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:212 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:216 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:220 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:224 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:228 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:232 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:236 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:240 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:268 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:272 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:276 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:280 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:284 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:288 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:292 -; GFX11-NEXT: v_dual_mov_b32 v2, v52 :: v_dual_mov_b32 v5, v67 -; GFX11-NEXT: v_dual_mov_b32 v10, v112 :: v_dual_mov_b32 v13, v151 -; GFX11-NEXT: v_dual_mov_b32 v25, v182 :: v_dual_mov_b32 v26, v181 -; GFX11-NEXT: v_dual_mov_b32 v27, v180 :: v_dual_mov_b32 v28, v179 -; GFX11-NEXT: v_dual_mov_b32 v29, v178 :: v_dual_mov_b32 v30, v177 -; GFX11-NEXT: v_mov_b32_e32 v31, v176 +; GFX11-NEXT: scratch_load_b32 v143, off, s32 +; GFX11-NEXT: scratch_load_b32 v142, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v141, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v140, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v139, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:124 +; GFX11-NEXT: s_clause 0x17 ; 96-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:220 +; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v14 +; GFX11-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 +; GFX11-NEXT: v_dual_mov_b32 v9, v54 :: v_dual_mov_b32 v10, v65 +; GFX11-NEXT: v_mov_b32_e32 v14, v119 +; GFX11-NEXT: v_dual_mov_b32 v15, v135 :: v_dual_mov_b32 v16, v182 +; GFX11-NEXT: v_dual_mov_b32 v17, v183 :: v_dual_mov_b32 v18, v181 +; GFX11-NEXT: v_dual_mov_b32 v19, v180 :: v_dual_mov_b32 v20, v179 +; GFX11-NEXT: v_dual_mov_b32 v21, v178 :: v_dual_mov_b32 v22, v177 +; GFX11-NEXT: v_dual_mov_b32 v23, v176 :: v_dual_mov_b32 v24, v167 +; GFX11-NEXT: v_dual_mov_b32 v25, v166 :: v_dual_mov_b32 v26, v165 +; GFX11-NEXT: v_dual_mov_b32 v27, v164 :: v_dual_mov_b32 v28, v163 +; GFX11-NEXT: v_dual_mov_b32 v29, v162 :: v_dual_mov_b32 v30, v161 +; GFX11-NEXT: v_mov_b32_e32 v31, v160 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-NEXT: .LBB87_4: -; GFX11-NEXT: ; implicit-def: $vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78 -; GFX11-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 ; GFX11-NEXT: ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88 -; GFX11-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93 -; GFX11-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106 -; GFX11-NEXT: ; implicit-def: $vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123 -; GFX11-NEXT: ; implicit-def: $vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156 -; GFX11-NEXT: ; implicit-def: $vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159_vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169 +; GFX11-NEXT: ; implicit-def: $vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109 +; GFX11-NEXT: ; implicit-def: $vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122 +; GFX11-NEXT: ; implicit-def: $vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136 +; GFX11-NEXT: ; implicit-def: $vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143_vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151 ; GFX11-NEXT: s_branch .LBB87_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -149547,7 +148932,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 @@ -149678,29 +149064,21 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v26, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v6, v1 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v6, v1 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v26, v6, v1 ; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v15 @@ -149708,6 +149086,13 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v4 @@ -149718,17 +149103,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v26 +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v26 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v27, v23 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 @@ -150337,22 +149721,22 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 @@ -150370,148 +149754,148 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v1 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:184 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v9 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v11 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v14 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v51 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v53 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 @@ -150519,506 +149903,410 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:340 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:348 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:356 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:364 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:372 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v0 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v0 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:44 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:20 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:12 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:60 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB88_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v28, v33, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v48, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v54, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v40, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v46, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v14, v37, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v25, v25, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: v_or_b32_sdwa v26, v26, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v4, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v5, v5, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v63, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v60, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v56, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v58, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v12, v63, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v34, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v35, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v38, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v25, v25, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v26, v26, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v36, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v50, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v30, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v55, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v31, v31, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v42, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; kill: killed $vgpr32 @@ -151123,398 +150411,553 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: .LBB88_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB88_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v18, 0x300 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: v_or_b32_sdwa v29, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v0, 3, v46 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 3, v36 +; VI-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v4, 3, v33 +; VI-NEXT: v_add_u16_e32 v37, 3, v37 +; VI-NEXT: v_add_u16_e32 v35, 3, v35 +; VI-NEXT: v_add_u16_e32 v34, 3, v34 +; VI-NEXT: v_add_u16_e32 v3, 3, v50 +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v3, 3, v48 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v2, 3, v55 +; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v2, 3, v54 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v1, 3, v42 +; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v1, 3, v40 +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v57, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v4, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v0, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v29, 0x300, v29 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v9, 3, v9 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v9, 3, v9 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v2, 0x300, v3 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v10, 3, v10 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_e32 v10, 3, v10 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v1, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v4 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u16_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v6, 3, v6 -; VI-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v11, 3, v11 +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v5, 3, v5 -; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_e32 v11, 3, v11 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v51, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v12, 3, v12 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v49, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v5, 3, v5 -; VI-NEXT: v_or_b32_sdwa v5, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v5, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v12, 3, v12 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v6, 3, v6 -; VI-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v32, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v32, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v13, 3, v13 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v13, 3, v13 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v28, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 -; VI-NEXT: v_or_b32_e32 v28, v28, v32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v33, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v33, v33, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v14, 3, v14 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v27, 0x300, v27 -; VI-NEXT: v_or_b32_e32 v27, v27, v33 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v34, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v34, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v26, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 -; VI-NEXT: v_or_b32_e32 v26, v26, v34 +; VI-NEXT: v_or_b32_sdwa v17, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v35, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v35, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v35 +; VI-NEXT: v_or_b32_sdwa v16, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v36, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 -; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v36, v36, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v24, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 -; VI-NEXT: v_or_b32_e32 v24, v24, v36 +; VI-NEXT: v_or_b32_sdwa v36, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v37, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v37, v37, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 3, v39 +; VI-NEXT: v_or_b32_sdwa v39, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v14, 3, v38 +; VI-NEXT: v_add_u16_e32 v38, 3, v63 +; VI-NEXT: v_mov_b32_e32 v63, 0x300 +; VI-NEXT: v_add_u16_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v19, v12, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v32 +; VI-NEXT: v_add_u16_sdwa v20, v11, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v10, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v22, v9, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v23, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v27, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v28, v3, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v29, v2, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v30, v1, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v31, v0, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v33, v33, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v36, v36, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v39, v39, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v36 +; VI-NEXT: v_or_b32_e32 v16, v16, v33 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v14, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v39 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v23, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 -; VI-NEXT: v_or_b32_e32 v23, v23, v37 +; VI-NEXT: v_or_b32_sdwa v37, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v35, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v12, v34, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v38, v18, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v34, 3, v34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 -; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_u16_e32 v8, 3, v63 +; VI-NEXT: v_add_u16_e32 v48, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v9, 3, v62 -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 -; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_u16_e32 v9, 3, v61 +; VI-NEXT: v_or_b32_sdwa v48, v18, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v11, v48, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v10, 3, v60 -; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 -; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_u16_e32 v10, 3, v57 +; VI-NEXT: v_add_u16_e32 v49, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v11, 3, v56 -; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 -; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_u16_e32 v11, 3, v59 +; VI-NEXT: v_or_b32_sdwa v49, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v12, 3, v58 -; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 -; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_u16_e32 v12, 3, v47 +; VI-NEXT: v_add_u16_e32 v50, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v53, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v50, v18, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v10, v50, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v13, 3, v46 -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_u16_e32 v13, 3, v45 +; VI-NEXT: v_add_u16_e32 v51, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v52, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v24, v7, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v14, 3, v44 -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 -; VI-NEXT: v_add_u16_e32 v14, 3, v43 +; VI-NEXT: v_or_b32_sdwa v51, v18, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v15, 3, v42 -; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 -; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: v_add_u16_e32 v52, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v52, v18, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v9, v52, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v16, 3, v16 -; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_add_u16_e32 v53, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v53, v18, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v17, 3, v17 -; VI-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 +; VI-NEXT: v_add_u16_e32 v54, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v19, 3, v19 -; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v54, v18, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v8, v54, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v21, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v21 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v16, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v19, 0x300, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v16, v19, v16 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v55, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v55, v18, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v43, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: v_or_b32_sdwa v30, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v30, 0x300, v30 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v31, v51, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v31, 0x300, v31 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v40, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v29, v40 +; VI-NEXT: v_add_u16_e32 v40, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v38, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v38, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v40, v18, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v41, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v25, v6, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v7, v40, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v55, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 -; VI-NEXT: v_or_b32_e32 v22, v22, v38 -; VI-NEXT: v_or_b32_e32 v30, v30, v55 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v39, 3, v39 -; VI-NEXT: v_or_b32_sdwa v39, v48, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v21, v39, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v48, 3, v48 -; VI-NEXT: v_or_b32_sdwa v48, v49, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v49, 3, v49 -; VI-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v20, v49, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v50, 3, v50 -; VI-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v19, 3, v19 -; VI-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v41, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v41, v18, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v45, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v39, 3, v39 -; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v39, 0x300, v39 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v51, 3, v51 -; VI-NEXT: v_or_b32_sdwa v51, v52, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v52, 3, v52 -; VI-NEXT: v_or_b32_sdwa v52, v53, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v53, 3, v53 -; VI-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v54, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v19, v51, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v18, v53, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v39, v18 -; VI-NEXT: v_add_u16_e32 v39, 0x300, v52 -; VI-NEXT: v_or_b32_e32 v19, v39, v19 -; VI-NEXT: v_add_u16_e32 v39, 0x300, v50 -; VI-NEXT: v_or_b32_e32 v20, v39, v20 -; VI-NEXT: v_add_u16_e32 v39, 0x300, v48 -; VI-NEXT: v_or_b32_e32 v21, v39, v21 -; VI-NEXT: v_or_b32_e32 v31, v31, v54 +; VI-NEXT: v_add_u16_e32 v42, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v42, v18, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v44, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v26, v5, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v6, v42, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v43, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v43, v18, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v44, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v44, v18, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v5, v44, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v45, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v45, v18, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v46, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v46, v18, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v4, v46, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v47, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v47, v18, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v56, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v56, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v3, v56, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v57, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v57, v18, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v58, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v58, v18, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v2, v58, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v59, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v59, v18, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v60, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v60, v18, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v1, v60, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v61, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v61, v18, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v62, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v62, v18, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v18, v13, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v13, v37, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v18, v32, v18 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v0, v62, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v34, v37, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v19, v32, v19 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, v34, v0 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v61 +; VI-NEXT: v_or_b32_e32 v1, v34, v1 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v59 +; VI-NEXT: v_or_b32_e32 v2, v34, v2 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v57 +; VI-NEXT: v_or_b32_e32 v3, v34, v3 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v47 +; VI-NEXT: v_or_b32_e32 v4, v34, v4 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v45 +; VI-NEXT: v_or_b32_e32 v5, v34, v5 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v43 +; VI-NEXT: v_or_b32_e32 v6, v34, v6 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v41 +; VI-NEXT: v_or_b32_e32 v7, v34, v7 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v55 +; VI-NEXT: v_or_b32_e32 v8, v34, v8 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v53 +; VI-NEXT: v_or_b32_e32 v9, v34, v9 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v51 +; VI-NEXT: v_or_b32_e32 v10, v34, v10 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v49 +; VI-NEXT: v_or_b32_e32 v11, v34, v11 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v38 +; VI-NEXT: v_or_b32_e32 v12, v34, v12 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v35 +; VI-NEXT: v_or_b32_e32 v13, v34, v13 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v20, v32, v20 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v21, v32, v21 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v22, v32, v22 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v23, v32, v23 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v24, v32, v24 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v25, v32, v25 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v26, v32, v26 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v28, v32, v28 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v29, v32, v29 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v30, v32, v30 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: .LBB88_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload @@ -151555,22 +150998,22 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 @@ -151589,192 +151032,186 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v1 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v11 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v17 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v25 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v13 -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v15 -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17 -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v25 -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v37 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v48 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v33 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v49 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v52 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v35 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:156 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v53 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -151782,26 +151219,24 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -151809,460 +151244,373 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:324 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:340 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:348 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:356 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:364 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:372 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v1 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:44 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:84 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB88_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_or_b32_sdwa v28, v38, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v48, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v54, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v40, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v32, v45, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v10, v57, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v13, v63, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v14, v34, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v16, v17, v16, s6 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v17, v19, v18, s6 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v18, v35, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v17, v18, v17, s6 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v18, v19, v18, s6 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v36, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s6 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v20, v21, v20, s6 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s6 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v22, v23, v22, s6 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v49, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v52, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v41, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v30, v31, v30, s6 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; kill: killed $vgpr32 @@ -152365,403 +151713,537 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: .LBB88_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB88_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u16_e32 v0, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v30, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v25, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v26, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v27, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v28, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v49 +; GFX9-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-NEXT: v_or_b32_sdwa v35, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v48 +; GFX9-NEXT: v_mov_b32_e32 v2, v36 +; GFX9-NEXT: v_or_b32_sdwa v36, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v3, v37 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v2 -; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v3 -; GFX9-NEXT: v_perm_b32 v0, v2, v0, s6 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v37, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v39, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s6 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 -; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 -; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 -; GFX9-NEXT: v_add_u16_e32 v35, 0x300, v25 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v48, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v38, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v23, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v24, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v18, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v19, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v20, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v21, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v36, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v22 -; GFX9-NEXT: v_add_u16_e32 v36, 0x300, v36 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v22, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v23, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v37, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v21 -; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v23 -; GFX9-NEXT: v_perm_b32 v29, v34, v29, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_or_b32_sdwa v49, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v49, 0x300, v49 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v50, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v38, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v51, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v52, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v39, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 -; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v39 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v48, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 -; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v43, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v45, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v46, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v47, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v56, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v57, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v58, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v34, 3, v34 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 -; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 -; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v57 +; GFX9-NEXT: v_or_b32_sdwa v59, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v63 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 -; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 -; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v59 +; GFX9-NEXT: v_or_b32_sdwa v60, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v58 -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 -; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 -; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v61, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 -; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 -; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v62, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 -; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 -; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v63, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v15, 3, v42 -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 -; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 -; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 -; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 +; GFX9-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v16 -; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v18 -; GFX9-NEXT: v_perm_b32 v17, v17, v20, s6 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v19 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v16, v18, v16, s6 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v49, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v20 -; GFX9-NEXT: v_perm_b32 v30, v33, v30, s6 +; GFX9-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v50, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v52, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v39, 0x300, v50 -; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v18 +; GFX9-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v51, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v51 +; GFX9-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v52, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v32, 0x300, v19 -; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 +; GFX9-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v53, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v53 +; GFX9-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v54, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_perm_b32 v6, v6, v7, s6 +; GFX9-NEXT: v_perm_b32 v7, v8, v9, s6 +; GFX9-NEXT: v_perm_b32 v8, v14, v15, s6 +; GFX9-NEXT: v_perm_b32 v9, v10, v11, s6 +; GFX9-NEXT: v_perm_b32 v10, v12, v13, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v55, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v55 +; GFX9-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v40, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v50, 0x300, v40 -; GFX9-NEXT: v_perm_b32 v21, v50, v21, s6 +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v41, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v41 +; GFX9-NEXT: v_or_b32_sdwa v42, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v42, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v51, 0x300, v42 -; GFX9-NEXT: v_perm_b32 v20, v51, v20, s6 +; GFX9-NEXT: v_or_b32_sdwa v44, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v43, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v43 +; GFX9-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v44, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 +; GFX9-NEXT: v_perm_b32 v3, v31, v3, s6 +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v27 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v23 +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v50 +; GFX9-NEXT: v_add_u16_e32 v50, 0x300, v51 +; GFX9-NEXT: v_add_u16_e32 v51, 0x300, v53 +; GFX9-NEXT: v_add_u16_e32 v53, 0x300, v41 +; GFX9-NEXT: v_add_u16_e32 v41, 0x300, v59 +; GFX9-NEXT: v_add_u16_e32 v59, 0x300, v60 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v45, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v45 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v46, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v24 -; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v26 -; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v37 +; GFX9-NEXT: v_or_b32_sdwa v32, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v32, 0x300, v32 +; GFX9-NEXT: v_perm_b32 v2, v32, v2, s6 +; GFX9-NEXT: v_add_u16_e32 v32, 0x300, v25 +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v20 +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v40 +; GFX9-NEXT: v_add_u16_e32 v40, 0x300, v57 +; GFX9-NEXT: v_add_u16_e32 v57, 0x300, v17 +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v44 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v33, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v33 +; GFX9-NEXT: v_perm_b32 v1, v33, v1, s6 +; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v26 +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v18 +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v46 +; GFX9-NEXT: v_add_u16_e32 v46, 0x300, v16 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v42 +; GFX9-NEXT: v_perm_b32 v4, v17, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v16, v5, s6 +; GFX9-NEXT: v_perm_b32 v11, v46, v57, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v34, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v34 +; GFX9-NEXT: v_perm_b32 v0, v34, v0, s6 +; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v28 +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v48 +; GFX9-NEXT: v_add_u16_e32 v48, 0x300, v21 +; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v54 +; GFX9-NEXT: v_add_u16_e32 v54, 0x300, v45 +; GFX9-NEXT: v_add_u16_e32 v45, 0x300, v63 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v30 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v35 +; GFX9-NEXT: v_add_u16_e32 v35, 0x300, v36 +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v37 +; GFX9-NEXT: v_add_u16_e32 v36, 0x300, v39 ; GFX9-NEXT: v_add_u16_e32 v37, 0x300, v38 -; GFX9-NEXT: v_add_u16_e32 v38, 0x300, v48 -; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v49 -; GFX9-NEXT: v_add_u16_e32 v48, 0x300, v52 -; GFX9-NEXT: v_add_u16_e32 v49, 0x300, v54 -; GFX9-NEXT: v_add_u16_e32 v52, 0x300, v44 -; GFX9-NEXT: v_add_u16_e32 v53, 0x300, v46 -; GFX9-NEXT: v_perm_b32 v18, v53, v18, s6 -; GFX9-NEXT: v_perm_b32 v19, v52, v19, s6 -; GFX9-NEXT: v_perm_b32 v22, v49, v22, s6 -; GFX9-NEXT: v_perm_b32 v23, v48, v23, s6 -; GFX9-NEXT: v_perm_b32 v24, v39, v24, s6 -; GFX9-NEXT: v_perm_b32 v25, v38, v25, s6 -; GFX9-NEXT: v_perm_b32 v26, v37, v26, s6 -; GFX9-NEXT: v_perm_b32 v27, v36, v27, s6 -; GFX9-NEXT: v_perm_b32 v28, v35, v28, s6 +; GFX9-NEXT: v_add_u16_e32 v38, 0x300, v24 +; GFX9-NEXT: v_add_u16_e32 v39, 0x300, v19 +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v22 +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v52 +; GFX9-NEXT: v_add_u16_e32 v52, 0x300, v55 +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v43 +; GFX9-NEXT: v_add_u16_e32 v55, 0x300, v47 +; GFX9-NEXT: v_perm_b32 v16, v55, v18, s6 +; GFX9-NEXT: v_perm_b32 v17, v54, v19, s6 +; GFX9-NEXT: v_perm_b32 v18, v53, v20, s6 +; GFX9-NEXT: v_perm_b32 v19, v52, v21, s6 +; GFX9-NEXT: v_perm_b32 v20, v51, v22, s6 +; GFX9-NEXT: v_perm_b32 v21, v50, v23, s6 +; GFX9-NEXT: v_perm_b32 v22, v49, v24, s6 +; GFX9-NEXT: v_perm_b32 v23, v48, v25, s6 +; GFX9-NEXT: v_perm_b32 v24, v39, v26, s6 +; GFX9-NEXT: v_perm_b32 v25, v38, v27, s6 +; GFX9-NEXT: v_perm_b32 v26, v37, v28, s6 +; GFX9-NEXT: v_perm_b32 v27, v36, v29, s6 +; GFX9-NEXT: v_perm_b32 v28, v35, v30, s6 +; GFX9-NEXT: v_perm_b32 v29, v34, v31, s6 +; GFX9-NEXT: v_perm_b32 v30, v33, v32, s6 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v47, 0x300, v56 +; GFX9-NEXT: v_add_u16_e32 v56, 0x300, v58 +; GFX9-NEXT: v_add_u16_e32 v43, 0x300, v61 +; GFX9-NEXT: v_add_u16_e32 v58, 0x300, v62 +; GFX9-NEXT: v_perm_b32 v12, v45, v58, s6 +; GFX9-NEXT: v_perm_b32 v13, v43, v59, s6 +; GFX9-NEXT: v_perm_b32 v14, v41, v56, s6 +; GFX9-NEXT: v_perm_b32 v15, v40, v47, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 ; GFX9-NEXT: .LBB88_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload @@ -154443,7 +153925,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 @@ -154453,13 +153934,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 -; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s73, s21 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s88, s24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v44, s19, 0 -; SI-NEXT: v_writelane_b32 v44, s18, 1 -; SI-NEXT: v_writelane_b32 v44, s17, 2 -; SI-NEXT: v_writelane_b32 v44, s16, 3 +; SI-NEXT: v_writelane_b32 v43, s28, 0 +; SI-NEXT: v_writelane_b32 v43, s27, 1 +; SI-NEXT: v_writelane_b32 v43, s25, 2 +; SI-NEXT: v_writelane_b32 v43, s19, 3 +; SI-NEXT: v_writelane_b32 v43, s18, 4 +; SI-NEXT: v_writelane_b32 v43, s17, 5 +; SI-NEXT: v_writelane_b32 v43, s16, 6 ; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: v_writelane_b32 v41, s31, 1 ; SI-NEXT: v_writelane_b32 v41, s34, 2 @@ -154479,14 +153963,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s64, 16 ; SI-NEXT: v_writelane_b32 v41, s65, 17 ; SI-NEXT: v_writelane_b32 v41, s66, 18 +; SI-NEXT: s_mov_b32 s75, s26 ; SI-NEXT: v_writelane_b32 v41, s67, 19 ; SI-NEXT: v_writelane_b32 v41, s68, 20 ; SI-NEXT: v_writelane_b32 v41, s69, 21 ; SI-NEXT: v_writelane_b32 v41, s70, 22 ; SI-NEXT: v_writelane_b32 v41, s71, 23 -; SI-NEXT: s_mov_b32 s74, s29 -; SI-NEXT: s_mov_b32 s78, s28 -; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: s_mov_b32 s59, s29 +; SI-NEXT: s_mov_b32 s13, s21 ; SI-NEXT: v_writelane_b32 v41, s80, 24 ; SI-NEXT: v_writelane_b32 v41, s81, 25 ; SI-NEXT: v_writelane_b32 v41, s82, 26 @@ -154496,105 +153980,99 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 -; SI-NEXT: s_mov_b32 s47, s26 ; SI-NEXT: v_writelane_b32 v41, s97, 33 -; SI-NEXT: v_writelane_b32 v41, s98, 34 -; SI-NEXT: v_writelane_b32 v41, s99, 35 +; SI-NEXT: s_mov_b32 s6, s23 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: v_readfirstlane_b32 s37, v22 -; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s38, v20 -; SI-NEXT: v_writelane_b32 v43, s37, 0 -; SI-NEXT: v_readfirstlane_b32 s39, v19 -; SI-NEXT: v_writelane_b32 v43, s38, 1 -; SI-NEXT: v_readfirstlane_b32 s48, v25 -; SI-NEXT: v_writelane_b32 v43, s39, 2 -; SI-NEXT: v_readfirstlane_b32 s49, v26 -; SI-NEXT: v_writelane_b32 v43, s48, 3 -; SI-NEXT: v_readfirstlane_b32 s50, v24 -; SI-NEXT: v_writelane_b32 v43, s49, 4 -; SI-NEXT: v_readfirstlane_b32 s51, v23 -; SI-NEXT: v_writelane_b32 v43, s50, 5 -; SI-NEXT: v_readfirstlane_b32 s52, v29 -; SI-NEXT: v_writelane_b32 v43, s51, 6 -; SI-NEXT: v_readfirstlane_b32 s53, v30 -; SI-NEXT: v_writelane_b32 v43, s52, 7 -; SI-NEXT: v_readfirstlane_b32 s54, v28 -; SI-NEXT: v_writelane_b32 v43, s53, 8 -; SI-NEXT: v_readfirstlane_b32 s55, v27 -; SI-NEXT: v_writelane_b32 v43, s54, 9 -; SI-NEXT: v_writelane_b32 v43, s55, 10 -; SI-NEXT: s_mov_b32 s57, s24 +; SI-NEXT: v_readfirstlane_b32 s50, v26 +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s51, v24 +; SI-NEXT: v_writelane_b32 v42, s50, 0 +; SI-NEXT: v_readfirstlane_b32 s52, v23 +; SI-NEXT: v_writelane_b32 v42, s51, 1 +; SI-NEXT: v_readfirstlane_b32 s53, v29 +; SI-NEXT: v_writelane_b32 v42, s52, 2 +; SI-NEXT: v_readfirstlane_b32 s54, v30 +; SI-NEXT: v_writelane_b32 v42, s53, 3 +; SI-NEXT: v_readfirstlane_b32 s55, v28 +; SI-NEXT: v_writelane_b32 v42, s54, 4 +; SI-NEXT: v_readfirstlane_b32 s64, v27 +; SI-NEXT: v_writelane_b32 v42, s55, 5 +; SI-NEXT: v_writelane_b32 v42, s64, 6 +; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: v_writelane_b32 v41, s99, 35 ; SI-NEXT: v_readfirstlane_b32 s16, v1 ; SI-NEXT: v_readfirstlane_b32 s17, v2 +; SI-NEXT: v_readfirstlane_b32 s18, v5 +; SI-NEXT: v_readfirstlane_b32 s19, v6 +; SI-NEXT: v_readfirstlane_b32 s76, v4 +; SI-NEXT: v_readfirstlane_b32 s78, v3 +; SI-NEXT: v_readfirstlane_b32 s91, v9 +; SI-NEXT: v_readfirstlane_b32 s92, v10 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s6, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 +; SI-NEXT: v_writelane_b32 v43, s4, 7 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v44, s4, 4 +; SI-NEXT: v_writelane_b32 v43, s4, 8 ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v44, s4, 5 +; SI-NEXT: v_writelane_b32 v43, s4, 9 ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v44, s4, 6 +; SI-NEXT: v_writelane_b32 v43, s4, 10 ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v44, s4, 7 +; SI-NEXT: v_writelane_b32 v43, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v44, s4, 8 +; SI-NEXT: v_writelane_b32 v43, s4, 12 ; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v43, s4, 13 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v43, s4, 14 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v44, s4, 9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v44, s4, 10 -; SI-NEXT: v_readfirstlane_b32 s18, v5 -; SI-NEXT: v_readfirstlane_b32 s19, v6 -; SI-NEXT: v_readfirstlane_b32 s77, v4 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_readfirstlane_b32 s90, v9 -; SI-NEXT: v_readfirstlane_b32 s91, v10 -; SI-NEXT: v_readfirstlane_b32 s92, v8 -; SI-NEXT: v_readfirstlane_b32 s93, v7 -; SI-NEXT: v_readfirstlane_b32 s94, v13 -; SI-NEXT: v_readfirstlane_b32 s95, v14 -; SI-NEXT: v_readfirstlane_b32 s30, v17 -; SI-NEXT: v_readfirstlane_b32 s31, v18 -; SI-NEXT: v_readfirstlane_b32 s34, v16 -; SI-NEXT: v_readfirstlane_b32 s35, v15 -; SI-NEXT: v_readfirstlane_b32 s36, v21 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s24, v40 +; SI-NEXT: v_readfirstlane_b32 s93, v8 +; SI-NEXT: v_readfirstlane_b32 s94, v7 +; SI-NEXT: v_readfirstlane_b32 s95, v13 +; SI-NEXT: v_readfirstlane_b32 s30, v11 +; SI-NEXT: v_readfirstlane_b32 s31, v17 +; SI-NEXT: v_readfirstlane_b32 s34, v18 +; SI-NEXT: v_readfirstlane_b32 s35, v16 +; SI-NEXT: v_readfirstlane_b32 s36, v15 +; SI-NEXT: v_readfirstlane_b32 s37, v21 +; SI-NEXT: v_readfirstlane_b32 s38, v22 +; SI-NEXT: v_readfirstlane_b32 s39, v20 +; SI-NEXT: v_readfirstlane_b32 s48, v19 +; SI-NEXT: v_readfirstlane_b32 s49, v25 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v44, s4, 11 +; SI-NEXT: v_writelane_b32 v43, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v44, s4, 12 +; SI-NEXT: v_writelane_b32 v43, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v44, s4, 13 +; SI-NEXT: v_writelane_b32 v43, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v44, s4, 14 +; SI-NEXT: v_writelane_b32 v43, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v44, s4, 15 +; SI-NEXT: v_writelane_b32 v43, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 @@ -154605,39 +154083,38 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s75, v32 +; SI-NEXT: v_readfirstlane_b32 s45, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s21, v33 +; SI-NEXT: v_readfirstlane_b32 s46, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v44, s4, 16 +; SI-NEXT: v_writelane_b32 v43, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_readfirstlane_b32 s43, v34 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s40, v35 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s61, v36 +; SI-NEXT: v_readfirstlane_b32 s42, v36 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s63, v37 +; SI-NEXT: v_readfirstlane_b32 s62, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v44, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s59, v31 +; SI-NEXT: v_readfirstlane_b32 s61, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s56, v38 +; SI-NEXT: v_readfirstlane_b32 s90, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s43, v39 +; SI-NEXT: v_readfirstlane_b32 s63, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: v_readfirstlane_b32 s24, v48 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s42, v49 +; SI-NEXT: v_readfirstlane_b32 s4, v49 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s13, v50 +; SI-NEXT: v_readfirstlane_b32 s60, v50 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s45, v51 +; SI-NEXT: v_readfirstlane_b32 s10, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 @@ -154645,45 +154122,40 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s88, v32 +; SI-NEXT: v_writelane_b32 v43, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s79, v33 +; SI-NEXT: v_readfirstlane_b32 s58, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 +; SI-NEXT: v_readfirstlane_b32 s26, v32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v44, s4, 18 +; SI-NEXT: v_readfirstlane_b32 s27, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v44, s4, 19 +; SI-NEXT: v_readfirstlane_b32 s72, v35 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v44, s4, 20 +; SI-NEXT: v_readfirstlane_b32 s79, v36 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v44, s4, 21 +; SI-NEXT: v_readfirstlane_b32 s66, v37 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v44, s4, 22 +; SI-NEXT: v_readfirstlane_b32 s44, v31 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v44, s4, 23 +; SI-NEXT: v_writelane_b32 v43, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v44, s4, 24 +; SI-NEXT: v_writelane_b32 v43, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v44, s4, 25 +; SI-NEXT: v_writelane_b32 v43, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v44, s4, 26 +; SI-NEXT: v_writelane_b32 v43, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v44, s4, 27 +; SI-NEXT: v_writelane_b32 v43, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: v_writelane_b32 v44, s4, 28 +; SI-NEXT: v_writelane_b32 v43, s4, 27 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(3) @@ -154699,55 +154171,52 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v44, s4, 29 +; SI-NEXT: v_writelane_b32 v43, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v44, s4, 30 +; SI-NEXT: v_writelane_b32 v43, s4, 29 ; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v44, s4, 31 +; SI-NEXT: v_writelane_b32 v43, s4, 30 ; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v44, s4, 32 +; SI-NEXT: v_writelane_b32 v43, s4, 31 ; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: v_writelane_b32 v44, s4, 33 -; SI-NEXT: v_writelane_b32 v44, s22, 34 -; SI-NEXT: v_writelane_b32 v44, s23, 35 -; SI-NEXT: v_writelane_b32 v44, s73, 36 -; SI-NEXT: v_writelane_b32 v44, s20, 37 -; SI-NEXT: v_writelane_b32 v44, s47, 38 -; SI-NEXT: v_writelane_b32 v44, s76, 39 -; SI-NEXT: v_writelane_b32 v44, s25, 40 -; SI-NEXT: v_writelane_b32 v44, s57, 41 -; SI-NEXT: v_writelane_b32 v44, s74, 42 -; SI-NEXT: v_writelane_b32 v44, s78, 43 -; SI-NEXT: v_writelane_b32 v44, s24, 44 -; SI-NEXT: v_writelane_b32 v44, s16, 45 -; SI-NEXT: v_writelane_b32 v44, s17, 46 -; SI-NEXT: v_writelane_b32 v44, s18, 47 -; SI-NEXT: v_writelane_b32 v44, s19, 48 -; SI-NEXT: v_writelane_b32 v44, s77, 49 -; SI-NEXT: v_writelane_b32 v44, s89, 50 -; SI-NEXT: v_writelane_b32 v44, s90, 51 -; SI-NEXT: v_writelane_b32 v44, s91, 52 -; SI-NEXT: v_writelane_b32 v44, s92, 53 -; SI-NEXT: v_writelane_b32 v44, s93, 54 -; SI-NEXT: v_writelane_b32 v44, s94, 55 -; SI-NEXT: v_writelane_b32 v44, s95, 56 +; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: v_readfirstlane_b32 s4, v40 +; SI-NEXT: v_writelane_b32 v43, s4, 33 +; SI-NEXT: v_writelane_b32 v43, s22, 34 +; SI-NEXT: v_writelane_b32 v43, s6, 35 +; SI-NEXT: v_writelane_b32 v43, s13, 36 +; SI-NEXT: v_writelane_b32 v43, s20, 37 +; SI-NEXT: v_writelane_b32 v43, s75, 38 +; SI-NEXT: v_writelane_b32 v43, s88, 39 +; SI-NEXT: v_writelane_b32 v43, s59, 40 +; SI-NEXT: v_writelane_b32 v43, s16, 41 +; SI-NEXT: v_writelane_b32 v43, s17, 42 +; SI-NEXT: v_writelane_b32 v43, s18, 43 +; SI-NEXT: v_writelane_b32 v43, s19, 44 +; SI-NEXT: v_writelane_b32 v43, s76, 45 +; SI-NEXT: v_writelane_b32 v43, s78, 46 +; SI-NEXT: v_writelane_b32 v43, s91, 47 +; SI-NEXT: v_writelane_b32 v43, s92, 48 +; SI-NEXT: v_writelane_b32 v43, s93, 49 +; SI-NEXT: v_writelane_b32 v43, s94, 50 +; SI-NEXT: v_writelane_b32 v43, s95, 51 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s58, v33 +; SI-NEXT: v_readfirstlane_b32 s47, v33 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s10, v34 +; SI-NEXT: v_readfirstlane_b32 s21, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s66, v35 -; SI-NEXT: v_readfirstlane_b32 s28, v31 -; SI-NEXT: v_readfirstlane_b32 s27, v32 +; SI-NEXT: v_readfirstlane_b32 s25, v35 +; SI-NEXT: v_readfirstlane_b32 s89, v31 +; SI-NEXT: v_readfirstlane_b32 s11, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s29, v36 +; SI-NEXT: v_readfirstlane_b32 s57, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s69, v37 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s14, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s68, v39 +; SI-NEXT: v_readfirstlane_b32 s29, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 @@ -154760,7 +154229,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s11, v49 +; SI-NEXT: v_readfirstlane_b32 s68, v49 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s70, v50 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -154770,33 +154239,38 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12 -; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11 -; SI-NEXT: v_writelane_b32 v44, vcc_lo, 57 -; SI-NEXT: v_writelane_b32 v44, vcc_hi, 58 -; SI-NEXT: v_writelane_b32 v44, s30, 59 -; SI-NEXT: v_writelane_b32 v44, s31, 60 -; SI-NEXT: v_writelane_b32 v44, s34, 61 -; SI-NEXT: v_writelane_b32 v44, s35, 62 -; SI-NEXT: v_writelane_b32 v44, s36, 63 +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v14 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v12 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 52 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 53 +; SI-NEXT: v_writelane_b32 v43, s30, 54 +; SI-NEXT: v_writelane_b32 v43, s31, 55 +; SI-NEXT: v_writelane_b32 v43, s34, 56 +; SI-NEXT: v_writelane_b32 v43, s35, 57 +; SI-NEXT: v_writelane_b32 v43, s36, 58 +; SI-NEXT: v_writelane_b32 v43, s37, 59 +; SI-NEXT: v_writelane_b32 v43, s38, 60 +; SI-NEXT: v_writelane_b32 v43, s39, 61 +; SI-NEXT: v_writelane_b32 v43, s48, 62 +; SI-NEXT: v_writelane_b32 v43, s49, 63 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s60, v31 +; SI-NEXT: v_readfirstlane_b32 s28, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s62, v32 +; SI-NEXT: v_readfirstlane_b32 s23, v32 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s83, v33 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s98, v34 +; SI-NEXT: v_readfirstlane_b32 s74, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s81, v35 +; SI-NEXT: v_readfirstlane_b32 s97, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s72, v36 +; SI-NEXT: v_readfirstlane_b32 s73, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s87, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s99, v38 +; SI-NEXT: v_readfirstlane_b32 s8, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s82, v39 +; SI-NEXT: v_readfirstlane_b32 s15, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 @@ -154806,13 +154280,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s26, v48 +; SI-NEXT: v_readfirstlane_b32 s77, v48 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s15, v49 +; SI-NEXT: v_readfirstlane_b32 s96, v49 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s96, v50 +; SI-NEXT: v_readfirstlane_b32 s7, v50 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s7, v51 +; SI-NEXT: v_readfirstlane_b32 s80, v51 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 @@ -154821,453 +154295,454 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s41, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s97, v32 +; SI-NEXT: v_readfirstlane_b32 s98, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s44, v33 +; SI-NEXT: v_readfirstlane_b32 s56, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s9, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s80, v35 +; SI-NEXT: v_readfirstlane_b32 s82, v35 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s86, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s85, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s8, v38 +; SI-NEXT: v_readfirstlane_b32 s99, v38 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s12, v39 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s65, v48 +; SI-NEXT: v_readfirstlane_b32 s67, v48 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s64, v49 -; SI-NEXT: v_writelane_b32 v43, s64, 11 +; SI-NEXT: v_readfirstlane_b32 s65, v49 +; SI-NEXT: v_writelane_b32 v42, s65, 7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s67, v50 -; SI-NEXT: v_writelane_b32 v43, s65, 12 +; SI-NEXT: v_readfirstlane_b32 s81, v50 +; SI-NEXT: v_writelane_b32 v42, s67, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s84, v51 -; SI-NEXT: v_writelane_b32 v43, s67, 13 -; SI-NEXT: v_writelane_b32 v43, s84, 14 -; SI-NEXT: v_writelane_b32 v43, s85, 15 -; SI-NEXT: v_writelane_b32 v43, s86, 16 -; SI-NEXT: v_writelane_b32 v43, s87, 17 -; SI-NEXT: v_writelane_b32 v43, s8, 18 -; SI-NEXT: v_writelane_b32 v43, s99, 19 -; SI-NEXT: v_writelane_b32 v43, s12, 20 -; SI-NEXT: v_writelane_b32 v43, s44, 21 -; SI-NEXT: v_writelane_b32 v43, s97, 22 -; SI-NEXT: v_writelane_b32 v43, s15, 23 -; SI-NEXT: v_writelane_b32 v43, s96, 24 -; SI-NEXT: v_writelane_b32 v43, s98, 25 -; SI-NEXT: v_writelane_b32 v43, s83, 26 -; SI-NEXT: v_writelane_b32 v43, s82, 27 -; SI-NEXT: v_writelane_b32 v43, s9, 28 -; SI-NEXT: v_writelane_b32 v43, s81, 29 -; SI-NEXT: v_writelane_b32 v43, s80, 30 -; SI-NEXT: v_writelane_b32 v43, s7, 31 -; SI-NEXT: v_writelane_b32 v43, s72, 32 -; SI-NEXT: v_writelane_b32 v43, s26, 33 -; SI-NEXT: v_writelane_b32 v43, s41, 34 -; SI-NEXT: v_writelane_b32 v43, s14, 35 -; SI-NEXT: v_writelane_b32 v43, s69, 36 -; SI-NEXT: v_writelane_b32 v43, s71, 37 -; SI-NEXT: v_writelane_b32 v43, s70, 38 -; SI-NEXT: v_writelane_b32 v43, s68, 39 -; SI-NEXT: v_writelane_b32 v43, s60, 40 -; SI-NEXT: v_writelane_b32 v43, s62, 41 -; SI-NEXT: v_writelane_b32 v43, s11, 42 -; SI-NEXT: v_writelane_b32 v43, s10, 43 -; SI-NEXT: v_writelane_b32 v43, s58, 44 -; SI-NEXT: v_writelane_b32 v43, s66, 45 -; SI-NEXT: v_writelane_b32 v43, s29, 46 -; SI-NEXT: v_writelane_b32 v43, s28, 47 -; SI-NEXT: v_writelane_b32 v43, s27, 48 +; SI-NEXT: v_writelane_b32 v42, s81, 9 +; SI-NEXT: v_writelane_b32 v42, s84, 10 +; SI-NEXT: v_writelane_b32 v42, s85, 11 +; SI-NEXT: v_writelane_b32 v42, s86, 12 +; SI-NEXT: v_writelane_b32 v42, s87, 13 +; SI-NEXT: v_writelane_b32 v42, s99, 14 +; SI-NEXT: v_writelane_b32 v42, s8, 15 +; SI-NEXT: v_writelane_b32 v42, s12, 16 +; SI-NEXT: v_writelane_b32 v42, s56, 17 +; SI-NEXT: v_writelane_b32 v42, s98, 18 +; SI-NEXT: v_writelane_b32 v42, s96, 19 +; SI-NEXT: v_writelane_b32 v42, s7, 20 +; SI-NEXT: v_writelane_b32 v42, s74, 21 +; SI-NEXT: v_writelane_b32 v42, s83, 22 +; SI-NEXT: v_writelane_b32 v42, s15, 23 +; SI-NEXT: v_writelane_b32 v42, s9, 24 +; SI-NEXT: v_writelane_b32 v42, s97, 25 +; SI-NEXT: v_writelane_b32 v42, s82, 26 +; SI-NEXT: v_writelane_b32 v42, s80, 27 +; SI-NEXT: v_writelane_b32 v42, s73, 28 +; SI-NEXT: v_writelane_b32 v42, s77, 29 +; SI-NEXT: v_writelane_b32 v42, s41, 30 +; SI-NEXT: v_writelane_b32 v42, s14, 31 +; SI-NEXT: v_writelane_b32 v42, s69, 32 +; SI-NEXT: v_writelane_b32 v42, s71, 33 +; SI-NEXT: v_writelane_b32 v42, s70, 34 +; SI-NEXT: v_writelane_b32 v42, s29, 35 +; SI-NEXT: v_writelane_b32 v42, s28, 36 +; SI-NEXT: v_writelane_b32 v42, s23, 37 +; SI-NEXT: v_writelane_b32 v42, s68, 38 +; SI-NEXT: v_writelane_b32 v42, s21, 39 +; SI-NEXT: v_writelane_b32 v42, s47, 40 +; SI-NEXT: v_writelane_b32 v42, s25, 41 +; SI-NEXT: v_writelane_b32 v42, s57, 42 +; SI-NEXT: v_writelane_b32 v42, s89, 43 +; SI-NEXT: v_writelane_b32 v42, s11, 44 +; SI-NEXT: v_writelane_b32 v42, s44, 45 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readlane_b32 s4, v44, 3 +; SI-NEXT: v_readlane_b32 s4, v43, 6 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v44, 2 +; SI-NEXT: v_readlane_b32 s5, v43, 5 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v43, s4, 58 -; SI-NEXT: v_readlane_b32 s4, v44, 1 +; SI-NEXT: v_writelane_b32 v42, s4, 55 +; SI-NEXT: v_readlane_b32 s4, v43, 4 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v44, 0 +; SI-NEXT: v_readlane_b32 s5, v43, 3 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v43, s4, 59 +; SI-NEXT: v_writelane_b32 v42, s4, 56 ; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s73, 8 +; SI-NEXT: s_lshl_b32 s5, s13, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_mov_b32 s22, s6 -; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 60 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 57 ; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: v_writelane_b32 v42, s4, 58 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: v_readlane_b32 s4, v43, 2 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s25, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 61 +; SI-NEXT: s_lshl_b32 s6, s4, 24 ; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s47, 0xff +; SI-NEXT: v_writelane_b32 v42, s4, 59 +; SI-NEXT: s_and_b32 s5, s75, 0xff +; SI-NEXT: v_readlane_b32 s4, v43, 1 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 62 +; SI-NEXT: s_lshl_b32 s6, s4, 24 ; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s78, 0xff -; SI-NEXT: s_lshl_b32 s6, s74, 8 +; SI-NEXT: v_writelane_b32 v42, s4, 60 +; SI-NEXT: v_readlane_b32 s4, v43, 0 +; SI-NEXT: s_and_b32 s5, s4, 0xff +; SI-NEXT: s_lshl_b32 s6, s59, 8 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s17, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 63 ; SI-NEXT: s_or_b32 s4, s16, s6 -; SI-NEXT: s_and_b32 s6, s89, 0xff -; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: s_and_b32 s6, s78, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s77, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 0 -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_writelane_b32 v42, s6, 1 +; SI-NEXT: s_lshl_b32 s16, s76, 24 +; SI-NEXT: s_or_b32 s76, s16, s6 ; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s19, 24 -; SI-NEXT: s_or_b32 s76, s16, s6 -; SI-NEXT: s_and_b32 s6, s93, 0xff -; SI-NEXT: s_lshl_b32 s16, s92, 8 +; SI-NEXT: s_or_b32 s78, s16, s6 +; SI-NEXT: s_and_b32 s6, s94, 0xff +; SI-NEXT: s_lshl_b32 s16, s93, 8 ; SI-NEXT: s_or_b32 s6, s6, s16 -; SI-NEXT: s_and_b32 s16, s90, 0xff +; SI-NEXT: s_and_b32 s16, s91, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s92, 24 +; SI-NEXT: s_or_b32 s93, s17, s16 +; SI-NEXT: s_and_b32 s16, s30, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s91, 24 -; SI-NEXT: s_or_b32 s77, s17, s16 -; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s17, vcc_hi, 24 +; SI-NEXT: s_or_b32 s22, s17, s16 +; SI-NEXT: s_and_b32 s16, s95, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 -; SI-NEXT: s_or_b32 s25, s17, s16 -; SI-NEXT: s_and_b32 s16, s94, 0xff -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s95, 24 -; SI-NEXT: s_or_b32 s74, s17, s16 -; SI-NEXT: s_and_b32 s16, s35, 0xff -; SI-NEXT: s_lshl_b32 s17, s34, 8 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_writelane_b32 v42, s16, 46 +; SI-NEXT: s_and_b32 s16, s36, 0xff +; SI-NEXT: s_lshl_b32 s17, s35, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s30, 0xff +; SI-NEXT: s_and_b32 s17, s31, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s34, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_writelane_b32 v42, s17, 47 +; SI-NEXT: s_and_b32 s17, s48, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s31, 24 -; SI-NEXT: s_or_b32 s78, s18, s17 -; SI-NEXT: s_and_b32 s17, s39, 0xff +; SI-NEXT: s_lshl_b32 s18, s39, 24 +; SI-NEXT: s_or_b32 s31, s18, s17 +; SI-NEXT: s_and_b32 s17, s37, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s38, 24 -; SI-NEXT: s_mov_b32 s31, s88 -; SI-NEXT: s_or_b32 s88, s18, s17 -; SI-NEXT: s_and_b32 s17, s36, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s37, 24 -; SI-NEXT: s_or_b32 s89, s18, s17 -; SI-NEXT: s_and_b32 s17, s51, 0xff -; SI-NEXT: s_lshl_b32 s18, s50, 8 +; SI-NEXT: s_or_b32 s75, s18, s17 +; SI-NEXT: s_and_b32 s17, s52, 0xff +; SI-NEXT: s_lshl_b32 s18, s51, 8 ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s48, 0xff +; SI-NEXT: s_and_b32 s18, s49, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s49, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_writelane_b32 v43, s18, 49 -; SI-NEXT: s_and_b32 s18, s55, 0xff +; SI-NEXT: s_lshl_b32 s19, s50, 24 +; SI-NEXT: s_or_b32 s88, s19, s18 +; SI-NEXT: s_and_b32 s18, s64, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s54, 24 -; SI-NEXT: s_mov_b32 s73, s79 -; SI-NEXT: s_or_b32 s79, s19, s18 -; SI-NEXT: s_and_b32 s18, s52, 0xff +; SI-NEXT: s_lshl_b32 s19, s55, 24 +; SI-NEXT: s_or_b32 s13, s19, s18 +; SI-NEXT: s_and_b32 s18, s53, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s53, 24 -; SI-NEXT: s_or_b32 s94, s19, s18 +; SI-NEXT: s_lshl_b32 s19, s54, 24 +; SI-NEXT: s_or_b32 s59, s19, s18 ; SI-NEXT: s_and_b32 s18, s84, 0xff -; SI-NEXT: s_lshl_b32 s19, s67, 8 +; SI-NEXT: s_lshl_b32 s19, s81, 8 ; SI-NEXT: s_or_b32 s18, s18, s19 -; SI-NEXT: s_and_b32 s19, s64, 0xff +; SI-NEXT: s_and_b32 s19, s65, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s65, 24 -; SI-NEXT: s_or_b32 s95, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s67, 24 +; SI-NEXT: s_or_b32 s94, s20, s19 ; SI-NEXT: s_and_b32 s19, s12, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s8, 24 -; SI-NEXT: s_or_b32 s8, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s99, 24 +; SI-NEXT: s_mov_b32 s30, s90 +; SI-NEXT: s_or_b32 s90, s20, s19 ; SI-NEXT: s_and_b32 s19, s85, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s86, 24 -; SI-NEXT: s_or_b32 s12, s20, s19 -; SI-NEXT: s_and_b32 s19, s80, 0xff +; SI-NEXT: s_or_b32 s64, s20, s19 +; SI-NEXT: s_and_b32 s19, s82, 0xff ; SI-NEXT: s_lshl_b32 s20, s9, 8 ; SI-NEXT: s_or_b32 vcc_lo, s19, s20 -; SI-NEXT: s_and_b32 s19, s44, 0xff +; SI-NEXT: s_and_b32 s19, s56, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s97, 24 -; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s98, 24 +; SI-NEXT: s_or_b32 s12, s20, s19 ; SI-NEXT: s_and_b32 s19, s41, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s96, 0xff +; SI-NEXT: s_lshl_b32 s20, s80, 24 +; SI-NEXT: s_mov_b32 s95, s10 +; SI-NEXT: s_or_b32 s10, s20, s19 +; SI-NEXT: s_and_b32 s19, s7, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s15, 24 -; SI-NEXT: v_writelane_b32 v43, s12, 50 -; SI-NEXT: s_or_b32 s12, s20, s19 -; SI-NEXT: s_and_b32 s19, s26, 0xff -; SI-NEXT: s_lshl_b32 s20, s82, 8 +; SI-NEXT: s_lshl_b32 s20, s96, 24 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s77, 0xff +; SI-NEXT: s_lshl_b32 s20, s15, 8 ; SI-NEXT: s_or_b32 vcc_hi, s19, s20 -; SI-NEXT: s_and_b32 s19, s99, 0xff +; SI-NEXT: s_and_b32 s19, s8, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s87, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 51 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s72, 0xff +; SI-NEXT: s_or_b32 s8, s20, s19 +; SI-NEXT: s_and_b32 s19, s73, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s81, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 52 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s98, 0xff +; SI-NEXT: s_lshl_b32 s20, s97, 24 +; SI-NEXT: v_writelane_b32 v42, s7, 48 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s74, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s83, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 54 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s62, 0xff -; SI-NEXT: s_lshl_b32 s20, s60, 8 +; SI-NEXT: v_writelane_b32 v42, s7, 50 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s23, 0xff +; SI-NEXT: s_lshl_b32 s20, s28, 8 ; SI-NEXT: s_or_b32 s84, s19, s20 ; SI-NEXT: s_and_b32 s19, s71, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s70, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 53 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_writelane_b32 v42, s7, 49 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s68, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s68, 24 -; SI-NEXT: s_or_b32 s57, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s29, 24 +; SI-NEXT: v_writelane_b32 v42, s7, 51 +; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s14, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s69, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 55 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s29, 0xff -; SI-NEXT: s_lshl_b32 s20, s66, 8 +; SI-NEXT: s_or_b32 s14, s20, s19 +; SI-NEXT: s_and_b32 s19, s57, 0xff +; SI-NEXT: s_lshl_b32 s20, s25, 8 ; SI-NEXT: s_or_b32 s85, s19, s20 -; SI-NEXT: s_and_b32 s19, s10, 0xff +; SI-NEXT: s_and_b32 s19, s21, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s58, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 56 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s27, 0xff +; SI-NEXT: s_lshl_b32 s20, s47, 24 +; SI-NEXT: v_writelane_b32 v42, s14, 52 +; SI-NEXT: s_mov_b32 s57, s7 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_writelane_b32 v42, s7, 53 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s28, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 57 +; SI-NEXT: s_lshl_b32 s20, s89, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 33 ; SI-NEXT: s_or_b32 s23, s20, s19 -; SI-NEXT: s_and_b32 s19, s24, 0xff -; SI-NEXT: v_readlane_b32 s9, v44, 33 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 32 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v44, 32 -; SI-NEXT: s_or_b32 s10, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v44, 31 -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v44, 30 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: v_writelane_b32 v42, s7, 54 +; SI-NEXT: v_readlane_b32 s7, v43, 31 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 30 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 29 ; SI-NEXT: s_or_b32 s86, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v44, 29 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 28 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v44, 28 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 27 ; SI-NEXT: s_or_b32 s47, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v44, 27 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 26 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 26 -; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s11, v43, 25 +; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 25 +; SI-NEXT: v_readlane_b32 s11, v43, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s11, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 24 -; SI-NEXT: s_or_b32 s24, s20, s19 -; SI-NEXT: s_mov_b32 s92, s11 +; SI-NEXT: v_readlane_b32 s11, v43, 23 +; SI-NEXT: s_or_b32 s14, s20, s19 +; SI-NEXT: s_mov_b32 s65, s11 ; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 23 -; SI-NEXT: s_mov_b32 s36, s11 +; SI-NEXT: v_readlane_b32 s11, v43, 22 ; SI-NEXT: s_lshl_b32 s20, s11, 8 -; SI-NEXT: v_readlane_b32 s11, v44, 22 ; SI-NEXT: s_or_b32 s87, s19, s20 -; SI-NEXT: s_mov_b32 s62, s11 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 21 +; SI-NEXT: s_and_b32 s19, s44, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s30, s11 -; SI-NEXT: s_lshl_b32 s20, s11, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 20 -; SI-NEXT: s_or_b32 s58, s20, s19 -; SI-NEXT: s_mov_b32 s91, s11 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 19 +; SI-NEXT: s_lshl_b32 s20, s66, 24 +; SI-NEXT: s_or_b32 s15, s20, s19 +; SI-NEXT: s_and_b32 s19, s79, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s35, s11 -; SI-NEXT: s_lshl_b32 s20, s11, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 18 -; SI-NEXT: s_mov_b32 s4, s46 -; SI-NEXT: s_or_b32 s46, s20, s19 -; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: s_lshl_b32 s20, s72, 24 +; SI-NEXT: s_mov_b32 s37, s11 +; SI-NEXT: s_or_b32 s11, s20, s19 +; SI-NEXT: s_and_b32 s19, s27, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s73, 24 -; SI-NEXT: s_mov_b32 s52, s73 -; SI-NEXT: s_or_b32 s73, s20, s19 -; SI-NEXT: s_and_b32 s19, s31, 0xff -; SI-NEXT: s_lshl_b32 s20, s45, 8 +; SI-NEXT: s_lshl_b32 s20, s58, 24 +; SI-NEXT: s_mov_b32 s36, s72 +; SI-NEXT: s_or_b32 s72, s20, s19 +; SI-NEXT: s_and_b32 s19, s26, 0xff +; SI-NEXT: s_lshl_b32 s20, s95, 8 +; SI-NEXT: v_readlane_b32 s9, v43, 21 +; SI-NEXT: s_mov_b32 s91, s26 ; SI-NEXT: s_or_b32 s26, s19, s20 -; SI-NEXT: s_and_b32 s19, s13, 0xff +; SI-NEXT: s_and_b32 s19, s60, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s42, 24 +; SI-NEXT: s_lshl_b32 s20, s9, 24 ; SI-NEXT: s_or_b32 s67, s20, s19 -; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_and_b32 s19, s24, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s43, 24 -; SI-NEXT: s_mov_b32 s53, s42 -; SI-NEXT: s_or_b32 s42, s20, s19 -; SI-NEXT: s_and_b32 s19, s56, 0xff +; SI-NEXT: s_lshl_b32 s20, s63, 24 +; SI-NEXT: s_mov_b32 s49, s66 +; SI-NEXT: s_or_b32 s66, s20, s19 +; SI-NEXT: s_and_b32 s19, s30, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s59, 24 +; SI-NEXT: s_lshl_b32 s20, s61, 24 ; SI-NEXT: s_or_b32 s68, s20, s19 -; SI-NEXT: s_and_b32 s19, s63, 0xff -; SI-NEXT: s_lshl_b32 s20, s61, 8 -; SI-NEXT: v_readlane_b32 s93, v44, 17 +; SI-NEXT: s_and_b32 s19, s62, 0xff +; SI-NEXT: s_lshl_b32 s20, s42, 8 +; SI-NEXT: s_mov_b32 s53, s27 ; SI-NEXT: s_or_b32 s27, s19, s20 ; SI-NEXT: s_and_b32 s19, s40, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s93, 24 +; SI-NEXT: s_lshl_b32 s20, s43, 24 ; SI-NEXT: s_or_b32 s70, s20, s19 -; SI-NEXT: s_and_b32 s19, s21, 0xff -; SI-NEXT: s_mov_b32 s51, s59 -; SI-NEXT: s_mov_b32 s59, s7 +; SI-NEXT: s_and_b32 s19, s46, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s75, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 16 -; SI-NEXT: s_mov_b32 s48, s56 -; SI-NEXT: s_mov_b32 s56, s10 +; SI-NEXT: s_mov_b32 s48, s45 +; SI-NEXT: s_lshl_b32 s20, s45, 24 +; SI-NEXT: s_mov_b32 s45, s7 +; SI-NEXT: v_readlane_b32 s7, v43, 20 ; SI-NEXT: s_or_b32 s69, s20, s19 -; SI-NEXT: s_mov_b32 s10, s7 +; SI-NEXT: s_mov_b32 s71, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 15 +; SI-NEXT: v_readlane_b32 s7, v43, 19 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s71, s7 +; SI-NEXT: s_mov_b32 s77, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 14 -; SI-NEXT: s_mov_b32 s39, s75 -; SI-NEXT: s_mov_b32 s75, s94 -; SI-NEXT: s_or_b32 s94, s20, s19 +; SI-NEXT: v_readlane_b32 s7, v43, 18 +; SI-NEXT: s_mov_b32 s50, s24 +; SI-NEXT: s_or_b32 s24, s20, s19 ; SI-NEXT: s_mov_b32 s41, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 13 +; SI-NEXT: v_readlane_b32 s7, v43, 17 +; SI-NEXT: s_mov_b32 s81, s46 +; SI-NEXT: s_mov_b32 s46, s14 ; SI-NEXT: s_mov_b32 s14, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 12 +; SI-NEXT: v_readlane_b32 s7, v43, 16 ; SI-NEXT: s_or_b32 s29, s19, s20 -; SI-NEXT: s_mov_b32 s81, s7 +; SI-NEXT: s_mov_b32 s56, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 11 -; SI-NEXT: s_mov_b32 s55, s45 -; SI-NEXT: s_mov_b32 s45, s9 +; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: s_mov_b32 s52, s9 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 10 -; SI-NEXT: s_mov_b32 s38, s11 -; SI-NEXT: s_or_b32 s11, s20, s19 -; SI-NEXT: s_mov_b32 s72, s7 +; SI-NEXT: v_readlane_b32 s7, v43, 14 +; SI-NEXT: s_or_b32 s25, s20, s19 +; SI-NEXT: s_mov_b32 s73, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 9 +; SI-NEXT: v_readlane_b32 s7, v43, 13 +; SI-NEXT: s_mov_b32 s39, s58 +; SI-NEXT: s_mov_b32 s58, s15 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s82, s7 +; SI-NEXT: s_mov_b32 s15, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 12 ; SI-NEXT: s_or_b32 s80, s20, s19 ; SI-NEXT: s_mov_b32 s83, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 7 +; SI-NEXT: v_readlane_b32 s7, v43, 11 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s96, s7 +; SI-NEXT: s_mov_b32 s97, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 6 -; SI-NEXT: s_mov_b32 s90, s31 -; SI-NEXT: s_or_b32 s31, s20, s19 -; SI-NEXT: s_mov_b32 s98, s7 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 5 +; SI-NEXT: v_readlane_b32 s7, v43, 10 +; SI-NEXT: s_or_b32 s82, s20, s19 ; SI-NEXT: s_mov_b32 s44, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 9 +; SI-NEXT: s_mov_b32 s55, s63 +; SI-NEXT: s_mov_b32 s63, s8 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 4 -; SI-NEXT: s_mov_b32 s37, s43 -; SI-NEXT: s_mov_b32 s43, s93 -; SI-NEXT: s_mov_b32 s93, s21 +; SI-NEXT: v_readlane_b32 s8, v43, 8 +; SI-NEXT: s_mov_b32 s35, s62 +; SI-NEXT: s_mov_b32 s62, s12 ; SI-NEXT: s_or_b32 s21, s19, s20 -; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: s_mov_b32 s34, s4 +; SI-NEXT: s_and_b32 s19, s8, 0xff +; SI-NEXT: v_readlane_b32 s12, v43, 7 +; SI-NEXT: s_mov_b32 s38, s30 +; SI-NEXT: s_mov_b32 s30, s42 +; SI-NEXT: s_mov_b32 s42, s43 +; SI-NEXT: s_mov_b32 s43, s11 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s8 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s22, 24 -; SI-NEXT: v_readlane_b32 s4, v43, 60 -; SI-NEXT: s_mov_b32 s54, s13 -; SI-NEXT: s_mov_b32 s13, s12 -; SI-NEXT: s_mov_b32 s50, s63 -; SI-NEXT: s_mov_b32 s63, s95 -; SI-NEXT: s_mov_b32 s49, s61 -; SI-NEXT: s_mov_b32 s61, s8 -; SI-NEXT: s_mov_b32 s60, s40 -; SI-NEXT: s_mov_b32 s12, s7 -; SI-NEXT: s_mov_b32 s7, s22 -; SI-NEXT: s_or_b32 s15, s20, s19 -; SI-NEXT: s_lshl_b32 s20, s4, 16 -; SI-NEXT: s_lshl_b32 s95, s5, 16 -; SI-NEXT: s_lshl_b32 s22, s6, 16 +; SI-NEXT: s_mov_b32 s8, s12 +; SI-NEXT: s_lshl_b32 s20, s12, 24 +; SI-NEXT: v_readlane_b32 s12, v42, 57 +; SI-NEXT: s_mov_b32 s92, s79 +; SI-NEXT: s_mov_b32 s34, s95 +; SI-NEXT: s_mov_b32 s95, s60 +; SI-NEXT: s_mov_b32 s54, s61 +; SI-NEXT: s_mov_b32 s61, s94 +; SI-NEXT: s_mov_b32 s51, s40 +; SI-NEXT: s_or_b32 s28, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s12, 16 +; SI-NEXT: s_lshl_b32 s74, s5, 16 +; SI-NEXT: s_lshl_b32 s94, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s19, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s17, vcc_lo, 16 ; SI-NEXT: s_lshl_b32 s6, vcc_hi, 16 ; SI-NEXT: s_lshl_b32 s99, s84, 16 -; SI-NEXT: s_lshl_b32 s8, s85, 16 -; SI-NEXT: s_lshl_b32 s97, s86, 16 -; SI-NEXT: s_lshl_b32 s28, s87, 16 +; SI-NEXT: s_lshl_b32 s98, s85, 16 +; SI-NEXT: s_lshl_b32 s12, s86, 16 +; SI-NEXT: s_lshl_b32 s96, s87, 16 ; SI-NEXT: s_lshl_b32 s87, s26, 16 -; SI-NEXT: v_readlane_b32 s26, v43, 58 +; SI-NEXT: v_readlane_b32 s26, v42, 55 +; SI-NEXT: s_mov_b32 s60, s64 +; SI-NEXT: v_readlane_b32 s64, v42, 59 ; SI-NEXT: s_lshl_b32 s86, s27, 16 -; SI-NEXT: v_readlane_b32 s27, v43, 59 -; SI-NEXT: v_readlane_b32 s66, v43, 63 +; SI-NEXT: v_readlane_b32 s27, v42, 56 +; SI-NEXT: s_mov_b32 s79, s93 +; SI-NEXT: s_mov_b32 s93, s72 +; SI-NEXT: s_mov_b32 s72, s78 +; SI-NEXT: s_mov_b32 s78, s66 ; SI-NEXT: s_lshl_b32 s85, s29, 16 -; SI-NEXT: v_readlane_b32 s29, v43, 62 -; SI-NEXT: v_readlane_b32 s65, v43, 61 -; SI-NEXT: v_readlane_b32 s64, v42, 0 +; SI-NEXT: v_readlane_b32 s29, v42, 58 ; SI-NEXT: s_lshl_b32 s84, s21, 16 -; SI-NEXT: v_readlane_b32 s21, v42, 1 +; SI-NEXT: s_mov_b32 s21, s76 +; SI-NEXT: v_readlane_b32 s76, v42, 60 +; SI-NEXT: s_mov_b32 s66, s4 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true -; SI-NEXT: s_add_i32 s4, s98, 3 +; SI-NEXT: s_add_i32 s4, s44, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: s_add_i32 s6, s12, 3 +; SI-NEXT: s_lshl_b32 s5, s11, 8 +; SI-NEXT: s_add_i32 s6, s7, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 24 +; SI-NEXT: s_lshl_b32 s5, s8, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s72, 3 +; SI-NEXT: s_add_i32 s5, s73, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 s6, s82, 8 +; SI-NEXT: s_lshl_b32 s6, s15, 8 ; SI-NEXT: s_add_i32 s16, s83, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s6, s96, 24 +; SI-NEXT: s_lshl_b32 s6, s97, 24 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_or_b32 s6, s6, s16 @@ -155276,7 +154751,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_add_i32 s6, s41, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s16, s14, 8 -; SI-NEXT: s_add_i32 s17, s81, 3 +; SI-NEXT: s_add_i32 s17, s56, 3 ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s16, s9, 24 @@ -155285,155 +154760,156 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: s_add_i32 s16, s93, 3 +; SI-NEXT: s_add_i32 s16, s81, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s39, 8 -; SI-NEXT: s_add_i32 s18, s10, 3 +; SI-NEXT: s_lshl_b32 s17, s48, 8 +; SI-NEXT: s_add_i32 s18, s71, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s17, s71, 24 +; SI-NEXT: s_lshl_b32 s17, s77, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s50, 3 +; SI-NEXT: s_add_i32 s17, s35, 3 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s18, s49, 8 -; SI-NEXT: s_add_i32 s19, s60, 3 +; SI-NEXT: s_lshl_b32 s18, s30, 8 +; SI-NEXT: s_add_i32 s19, s51, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s18, s43, 24 +; SI-NEXT: s_lshl_b32 s18, s42, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_add_i32 s18, s34, 3 +; SI-NEXT: s_add_i32 s18, s50, 3 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s37, 8 -; SI-NEXT: s_add_i32 s20, s48, 3 +; SI-NEXT: s_lshl_b32 s19, s55, 8 +; SI-NEXT: s_add_i32 s20, s38, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s19, s51, 24 +; SI-NEXT: s_lshl_b32 s19, s54, 24 ; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_add_i32 s19, s90, 3 +; SI-NEXT: s_add_i32 s19, s91, 3 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s20, s55, 8 -; SI-NEXT: s_add_i32 s22, s54, 3 +; SI-NEXT: s_lshl_b32 s20, s34, 8 +; SI-NEXT: s_add_i32 s22, s95, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s20, s53, 24 +; SI-NEXT: s_lshl_b32 s20, s52, 24 ; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_add_i32 s20, s91, 3 +; SI-NEXT: s_add_i32 s20, s92, 3 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s35, 8 -; SI-NEXT: s_add_i32 s23, s38, 3 +; SI-NEXT: s_lshl_b32 s22, s36, 8 +; SI-NEXT: s_add_i32 s23, s53, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_and_b32 s23, s23, 0xff -; SI-NEXT: s_lshl_b32 s22, s52, 24 +; SI-NEXT: s_lshl_b32 s22, s39, 24 ; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: s_add_i32 s22, s92, 3 +; SI-NEXT: s_add_i32 s22, s65, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 45 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s23, s36, 8 -; SI-NEXT: s_add_i32 s60, s62, 3 +; SI-NEXT: s_lshl_b32 s23, s37, 8 +; SI-NEXT: s_add_i32 s60, s7, 3 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_and_b32 s60, s60, 0xff -; SI-NEXT: s_lshl_b32 s23, s30, 24 +; SI-NEXT: s_lshl_b32 s23, s49, 24 ; SI-NEXT: s_lshl_b32 s60, s60, 16 ; SI-NEXT: s_addk_i32 s22, 0x300 ; SI-NEXT: s_or_b32 s23, s23, s60 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: v_readlane_b32 s7, v44, 28 +; SI-NEXT: v_readlane_b32 s7, v43, 27 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_add_i32 s23, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v44, 27 +; SI-NEXT: v_readlane_b32 s7, v43, 26 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s60, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 25 +; SI-NEXT: v_readlane_b32 s7, v43, 24 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_lshl_b32 s60, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 26 +; SI-NEXT: v_readlane_b32 s7, v43, 25 ; SI-NEXT: s_add_i32 s61, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_lshl_b32 s61, s61, 16 ; SI-NEXT: s_addk_i32 s23, 0x300 ; SI-NEXT: s_or_b32 s60, s60, s61 ; SI-NEXT: s_and_b32 s23, s23, 0xffff -; SI-NEXT: v_readlane_b32 s7, v44, 32 +; SI-NEXT: v_readlane_b32 s7, v43, 31 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_add_i32 s60, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v44, 31 +; SI-NEXT: v_readlane_b32 s7, v43, 30 ; SI-NEXT: s_and_b32 s60, s60, 0xff ; SI-NEXT: s_lshl_b32 s61, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 29 +; SI-NEXT: v_readlane_b32 s7, v43, 28 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_lshl_b32 s61, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 30 +; SI-NEXT: v_readlane_b32 s7, v43, 29 ; SI-NEXT: s_add_i32 s62, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 48 +; SI-NEXT: v_readlane_b32 s7, v42, 44 ; SI-NEXT: s_and_b32 s62, s62, 0xff ; SI-NEXT: s_add_i32 s59, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 47 +; SI-NEXT: v_readlane_b32 s7, v42, 43 ; SI-NEXT: s_lshl_b32 s62, s62, 16 ; SI-NEXT: s_addk_i32 s60, 0x300 ; SI-NEXT: s_and_b32 s59, s59, 0xff ; SI-NEXT: s_lshl_b32 s58, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 33 +; SI-NEXT: v_readlane_b32 s7, v43, 32 ; SI-NEXT: s_or_b32 s61, s61, s62 ; SI-NEXT: s_and_b32 s60, s60, 0xffff ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_lshl_b32 s59, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 44 +; SI-NEXT: v_readlane_b32 s7, v43, 33 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_add_i32 s61, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 46 +; SI-NEXT: v_readlane_b32 s7, v42, 42 ; SI-NEXT: s_add_i32 s57, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 45 +; SI-NEXT: v_readlane_b32 s7, v42, 41 ; SI-NEXT: s_lshl_b32 s56, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 44 +; SI-NEXT: v_readlane_b32 s7, v42, 40 ; SI-NEXT: s_lshl_b32 s47, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 43 +; SI-NEXT: v_readlane_b32 s7, v42, 39 ; SI-NEXT: s_add_i32 s46, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 42 +; SI-NEXT: v_readlane_b32 s7, v42, 38 ; SI-NEXT: s_add_i32 s45, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 39 +; SI-NEXT: v_readlane_b32 s7, v42, 35 ; SI-NEXT: s_lshl_b32 s42, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 36 +; SI-NEXT: v_readlane_b32 s7, v42, 32 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 35 +; SI-NEXT: v_readlane_b32 s7, v42, 31 ; SI-NEXT: s_and_b32 s45, s45, 0xff ; SI-NEXT: s_add_i32 s14, s7, 3 ; SI-NEXT: s_or_b32 s42, s42, s45 ; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: v_readlane_b32 s7, v43, 41 +; SI-NEXT: v_readlane_b32 s7, v42, 37 ; SI-NEXT: s_and_b32 s57, s57, 0xff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s42, 0xffff ; SI-NEXT: s_add_i32 s44, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 40 +; SI-NEXT: v_readlane_b32 s7, v42, 36 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s57, s14, s15 ; SI-NEXT: s_and_b32 s14, s44, 0xff ; SI-NEXT: s_lshl_b32 s15, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 38 +; SI-NEXT: v_readlane_b32 s7, v42, 34 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 37 +; SI-NEXT: v_readlane_b32 s7, v42, 33 ; SI-NEXT: s_add_i32 s40, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_and_b32 s40, s40, 0xff @@ -155448,15 +154924,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s58, s59, s58 ; SI-NEXT: s_or_b32 s59, s15, s14 ; SI-NEXT: s_add_i32 s14, s6, 0x3000000 -; SI-NEXT: v_readlane_b32 s6, v43, 32 +; SI-NEXT: v_readlane_b32 s6, v42, 28 ; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 29 +; SI-NEXT: v_readlane_b32 s7, v42, 25 ; SI-NEXT: s_and_b32 s6, s11, 0xff ; SI-NEXT: s_lshl_b32 s8, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 26 +; SI-NEXT: v_readlane_b32 s7, v42, 22 ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_lshl_b32 s8, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 25 +; SI-NEXT: v_readlane_b32 s7, v42, 21 ; SI-NEXT: s_add_i32 s24, s7, 3 ; SI-NEXT: s_and_b32 s11, s24, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 @@ -155464,47 +154940,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s11 ; SI-NEXT: s_or_b32 s8, s8, s6 -; SI-NEXT: v_readlane_b32 s6, v43, 33 +; SI-NEXT: v_readlane_b32 s6, v42, 29 ; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 27 +; SI-NEXT: v_readlane_b32 s7, v42, 23 ; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_lshl_b32 s11, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 17 +; SI-NEXT: v_readlane_b32 s7, v42, 13 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 19 +; SI-NEXT: v_readlane_b32 s7, v42, 15 ; SI-NEXT: s_add_i32 s12, s7, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readlane_b32 s7, v43, 34 +; SI-NEXT: v_readlane_b32 s7, v42, 30 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_add_i32 s13, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 31 +; SI-NEXT: v_readlane_b32 s7, v42, 27 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_and_b32 s11, s13, 0xff ; SI-NEXT: s_lshl_b32 s10, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 23 +; SI-NEXT: v_readlane_b32 s7, v42, 19 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 20 ; SI-NEXT: s_add_i32 s25, s7, 3 ; SI-NEXT: s_and_b32 s12, s25, 0xff ; SI-NEXT: s_addk_i32 s10, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: v_readlane_b32 s7, v43, 30 +; SI-NEXT: v_readlane_b32 s7, v42, 26 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s9, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 28 -; SI-NEXT: v_readlane_b32 s11, v43, 21 +; SI-NEXT: v_readlane_b32 s7, v42, 24 +; SI-NEXT: v_readlane_b32 s11, v42, 17 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_readlane_b32 s9, v43, 22 +; SI-NEXT: v_readlane_b32 s9, v42, 18 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: s_lshl_b32 s9, s9, 24 @@ -155512,15 +154988,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s9, s9, s11 ; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: v_readlane_b32 s9, v42, 16 ; SI-NEXT: s_add_i32 s21, s9, 3 -; SI-NEXT: v_readlane_b32 s11, v43, 18 -; SI-NEXT: v_readlane_b32 s12, v43, 15 +; SI-NEXT: v_readlane_b32 s11, v42, 14 +; SI-NEXT: v_readlane_b32 s12, v42, 11 ; SI-NEXT: s_and_b32 s9, s21, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v43, 16 +; SI-NEXT: v_readlane_b32 s11, v42, 12 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s9, 0x300 ; SI-NEXT: s_lshl_b32 s11, s11, 24 @@ -155528,15 +155004,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v43, 14 +; SI-NEXT: v_readlane_b32 s11, v42, 10 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s12, v43, 13 -; SI-NEXT: v_readlane_b32 s13, v43, 11 +; SI-NEXT: v_readlane_b32 s12, v42, 9 +; SI-NEXT: v_readlane_b32 s13, v42, 7 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v43, 12 +; SI-NEXT: v_readlane_b32 s12, v42, 8 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 24 @@ -155544,16 +155020,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v43, 10 +; SI-NEXT: v_readlane_b32 s12, v42, 6 ; SI-NEXT: s_add_i32 s15, s16, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_readlane_b32 s13, v43, 9 -; SI-NEXT: v_readlane_b32 s16, v43, 7 +; SI-NEXT: v_readlane_b32 s13, v42, 5 +; SI-NEXT: v_readlane_b32 s16, v42, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v43, 8 +; SI-NEXT: v_readlane_b32 s13, v42, 4 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_lshl_b32 s13, s13, 24 @@ -155561,16 +155037,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s13, s13, s16 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v43, 6 +; SI-NEXT: v_readlane_b32 s13, v42, 2 ; SI-NEXT: s_add_i32 s40, s17, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s16, v43, 5 -; SI-NEXT: v_readlane_b32 s17, v43, 3 +; SI-NEXT: v_readlane_b32 s16, v42, 1 +; SI-NEXT: v_readlane_b32 s17, v43, 63 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v43, 4 +; SI-NEXT: v_readlane_b32 s16, v42, 0 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 ; SI-NEXT: s_lshl_b32 s16, s16, 24 @@ -155578,16 +155054,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v43, 2 +; SI-NEXT: v_readlane_b32 s16, v43, 62 ; SI-NEXT: s_add_i32 s41, s18, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 1 -; SI-NEXT: v_readlane_b32 s18, v44, 63 +; SI-NEXT: v_readlane_b32 s17, v43, 61 +; SI-NEXT: v_readlane_b32 s18, v43, 59 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 0 +; SI-NEXT: v_readlane_b32 s17, v43, 60 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -155596,16 +155072,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s17, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v44, 62 +; SI-NEXT: v_readlane_b32 s16, v43, 58 ; SI-NEXT: s_add_i32 s42, s19, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s18, v44, 61 -; SI-NEXT: v_readlane_b32 s19, v44, 59 +; SI-NEXT: v_readlane_b32 s18, v43, 57 +; SI-NEXT: v_readlane_b32 s19, v43, 55 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v44, 60 +; SI-NEXT: v_readlane_b32 s18, v43, 56 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s18, s18, 24 @@ -155613,16 +155089,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v44, 58 +; SI-NEXT: v_readlane_b32 s18, v43, 54 ; SI-NEXT: s_add_i32 s43, s20, 0x3000000 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s19, v44, 57 -; SI-NEXT: v_readlane_b32 s20, v44, 55 +; SI-NEXT: v_readlane_b32 s19, v43, 53 +; SI-NEXT: v_readlane_b32 s20, v43, 51 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v44, 56 +; SI-NEXT: v_readlane_b32 s19, v43, 52 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_lshl_b32 s19, s19, 24 @@ -155630,15 +155106,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v44, 54 +; SI-NEXT: v_readlane_b32 s19, v43, 50 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_readlane_b32 s20, v44, 53 -; SI-NEXT: v_readlane_b32 s21, v44, 51 +; SI-NEXT: v_readlane_b32 s20, v43, 49 +; SI-NEXT: v_readlane_b32 s21, v43, 47 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v44, 52 +; SI-NEXT: v_readlane_b32 s20, v43, 48 ; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_lshl_b32 s20, s20, 24 @@ -155646,16 +155122,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v44, 50 +; SI-NEXT: v_readlane_b32 s20, v43, 46 ; SI-NEXT: s_add_i32 s44, s22, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s21, v44, 49 -; SI-NEXT: v_readlane_b32 s22, v44, 47 +; SI-NEXT: v_readlane_b32 s21, v43, 45 +; SI-NEXT: v_readlane_b32 s22, v43, 43 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readlane_b32 s21, v44, 48 +; SI-NEXT: v_readlane_b32 s21, v43, 44 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s21, s21, 24 @@ -155664,16 +155140,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s21, s21, s22 ; SI-NEXT: s_or_b32 s20, s21, s20 ; SI-NEXT: s_add_i32 s21, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v44, 43 +; SI-NEXT: v_readlane_b32 s20, v43, 0 ; SI-NEXT: s_add_i32 s45, s23, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s22, v44, 42 -; SI-NEXT: v_readlane_b32 s23, v44, 45 +; SI-NEXT: v_readlane_b32 s22, v43, 40 +; SI-NEXT: v_readlane_b32 s23, v43, 41 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s22, s22, 8 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: v_readlane_b32 s22, v44, 46 +; SI-NEXT: v_readlane_b32 s22, v43, 42 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s22, s22, 24 @@ -155682,15 +155158,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_add_i32 s22, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v44, 41 +; SI-NEXT: v_readlane_b32 s20, v43, 39 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s23, v44, 40 -; SI-NEXT: v_readlane_b32 s24, v44, 38 +; SI-NEXT: v_readlane_b32 s23, v43, 2 +; SI-NEXT: v_readlane_b32 s24, v43, 38 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s20, s23, s20 -; SI-NEXT: v_readlane_b32 s23, v44, 39 +; SI-NEXT: v_readlane_b32 s23, v43, 1 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s23, s23, 24 @@ -155699,261 +155175,260 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s23, s23, s24 ; SI-NEXT: s_or_b32 s20, s23, s20 ; SI-NEXT: s_add_i32 s23, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v44, 37 +; SI-NEXT: v_readlane_b32 s20, v43, 37 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v44, 36 -; SI-NEXT: v_readlane_b32 s25, v44, 34 +; SI-NEXT: v_readlane_b32 s24, v43, 36 +; SI-NEXT: v_readlane_b32 s25, v43, 34 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v44, 35 +; SI-NEXT: v_readlane_b32 s24, v43, 35 ; SI-NEXT: s_and_b32 s25, s25, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s24, s24, 24 ; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_add_i32 s18, s18, 0x3000000 +; SI-NEXT: s_add_i32 s19, s19, 0x3000000 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_add_i32 s16, s16, 0x3000000 ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v44, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_readlane_b32 s25, v44, 2 -; SI-NEXT: v_readlane_b32 s26, v44, 1 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: s_lshl_b32 s25, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_readlane_b32 s25, v44, 0 -; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_add_i32 s13, s13, 0x3000000 -; SI-NEXT: s_addk_i32 s24, 0x300 -; SI-NEXT: s_lshl_b32 s25, s25, 24 -; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_add_i32 s9, s9, 0x3000000 -; SI-NEXT: s_add_i32 s11, s11, 0x3000000 -; SI-NEXT: s_add_i32 s18, s18, 0x3000000 -; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_or_b32 s25, s25, s26 -; SI-NEXT: s_and_b32 s89, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s17, 16 -; SI-NEXT: s_and_b32 s17, s13, 0xffff0000 -; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: s_and_b32 s74, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s18, 16 -; SI-NEXT: v_writelane_b32 v43, s17, 49 -; SI-NEXT: s_and_b32 s63, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s11, 16 -; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 +; SI-NEXT: v_readlane_b32 s24, v43, 6 +; SI-NEXT: s_and_b32 s79, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s19, 16 +; SI-NEXT: s_and_b32 s19, s18, 0xffff0000 ; SI-NEXT: s_and_b32 s46, s46, 0xff -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s11, 50 -; SI-NEXT: s_lshl_b32 s61, s9, 16 -; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: v_readlane_b32 s25, v43, 5 +; SI-NEXT: v_readlane_b32 s26, v43, 4 +; SI-NEXT: s_and_b32 s66, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s74, s22, 16 +; SI-NEXT: v_writelane_b32 v42, s19, 46 +; SI-NEXT: s_lshl_b32 s22, s18, 16 +; SI-NEXT: s_and_b32 s18, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s46, s46, 16 ; SI-NEXT: s_addk_i32 s56, 0x300 ; SI-NEXT: s_add_i32 s8, s8, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s9, 51 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_lshl_b32 s25, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_writelane_b32 v42, s18, 47 +; SI-NEXT: s_and_b32 s75, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s17, 16 +; SI-NEXT: s_and_b32 s62, s7, 0xffff0000 ; SI-NEXT: s_lshl_b32 s17, s7, 16 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_and_b32 s7, s10, 0xffff0000 ; SI-NEXT: s_or_b32 s46, s47, s46 ; SI-NEXT: s_and_b32 s47, s56, 0xffff -; SI-NEXT: v_writelane_b32 v43, s7, 52 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: v_readlane_b32 s25, v43, 3 +; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: v_writelane_b32 v42, s7, 48 ; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 ; SI-NEXT: s_or_b32 s56, s46, s47 ; SI-NEXT: s_add_i32 s47, s58, 0x3000000 ; SI-NEXT: s_add_i32 s58, s59, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s7, 53 +; SI-NEXT: s_addk_i32 s24, 0x300 +; SI-NEXT: s_lshl_b32 s25, s25, 24 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: v_writelane_b32 v42, s7, 49 ; SI-NEXT: s_lshl_b32 s7, s8, 16 ; SI-NEXT: s_add_i32 s57, s57, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s7, 54 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s25, s25, s26 +; SI-NEXT: v_writelane_b32 v42, s7, 50 ; SI-NEXT: s_and_b32 s7, s58, 0xffff0000 +; SI-NEXT: s_add_i32 s56, s56, 0x3000000 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: v_writelane_b32 v42, s7, 51 +; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s46, s60, 0x3000000 -; SI-NEXT: s_add_i32 s56, s56, 0x3000000 -; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 0x3000000 -; SI-NEXT: s_add_i32 s16, s16, 0x3000000 -; SI-NEXT: s_add_i32 s19, s19, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 0x3000000 ; SI-NEXT: s_add_i32 s24, s24, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s7, 55 -; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 +; SI-NEXT: v_writelane_b32 v42, s7, 52 +; SI-NEXT: s_and_b32 s7, s56, 0xffff0000 ; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 ; SI-NEXT: s_lshl_b32 s26, s24, 16 -; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s66, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s23, 16 -; SI-NEXT: s_and_b32 s64, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s95, s22, 16 -; SI-NEXT: s_and_b32 s76, s21, 0xffff0000 +; SI-NEXT: s_and_b32 s76, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s23, 16 +; SI-NEXT: s_and_b32 s72, s21, 0xffff0000 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s77, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s19, 16 -; SI-NEXT: s_and_b32 s78, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_and_b32 s88, s13, 0xffff0000 ; SI-NEXT: s_lshl_b32 s19, s13, 16 -; SI-NEXT: s_and_b32 s75, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s79, s12, 16 -; SI-NEXT: s_and_b32 s13, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s10, 16 +; SI-NEXT: s_and_b32 s59, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s12, 16 +; SI-NEXT: s_and_b32 s61, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s18, s11, 16 +; SI-NEXT: s_and_b32 s60, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s9, 16 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s63, s6, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s99, s58, 16 -; SI-NEXT: v_writelane_b32 v43, s7, 56 ; SI-NEXT: s_lshl_b32 s57, s57, 16 -; SI-NEXT: s_and_b32 s7, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s8, s56, 16 -; SI-NEXT: s_and_b32 s56, s47, 0xffff0000 +; SI-NEXT: v_writelane_b32 v42, s7, 53 +; SI-NEXT: s_lshl_b32 s98, s56, 16 +; SI-NEXT: s_and_b32 s7, s47, 0xffff0000 ; SI-NEXT: s_lshl_b32 s23, s47, 16 ; SI-NEXT: s_and_b32 s47, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s97, s46, 16 -; SI-NEXT: s_and_b32 s24, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s46, 16 +; SI-NEXT: s_and_b32 s46, s45, 0xffff0000 ; SI-NEXT: s_lshl_b32 s45, s45, 16 ; SI-NEXT: s_and_b32 s58, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s28, s44, 16 -; SI-NEXT: s_and_b32 s73, s43, 0xffff0000 -; SI-NEXT: s_lshl_b32 s46, s43, 16 +; SI-NEXT: s_lshl_b32 s96, s44, 16 +; SI-NEXT: s_and_b32 s93, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s43, 16 ; SI-NEXT: s_and_b32 s67, s42, 0xffff0000 ; SI-NEXT: s_lshl_b32 s87, s42, 16 ; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s41, 16 +; SI-NEXT: s_lshl_b32 s78, s41, 16 ; SI-NEXT: s_and_b32 s70, s40, 0xffff0000 ; SI-NEXT: s_lshl_b32 s86, s40, 16 -; SI-NEXT: s_and_b32 s94, s15, 0xffff0000 +; SI-NEXT: s_and_b32 s24, s15, 0xffff0000 ; SI-NEXT: s_lshl_b32 s69, s15, 16 -; SI-NEXT: s_and_b32 s11, s14, 0xffff0000 +; SI-NEXT: s_and_b32 s25, s14, 0xffff0000 ; SI-NEXT: s_lshl_b32 s85, s14, 16 -; SI-NEXT: s_and_b32 s31, s5, 0xffff0000 +; SI-NEXT: s_and_b32 s82, s5, 0xffff0000 ; SI-NEXT: s_lshl_b32 s80, s5, 16 -; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 +; SI-NEXT: s_and_b32 s28, s4, 0xffff0000 ; SI-NEXT: s_lshl_b32 s84, s4, 16 -; SI-NEXT: v_writelane_b32 v43, s7, 57 +; SI-NEXT: v_writelane_b32 v42, s7, 54 ; SI-NEXT: .LBB89_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_readlane_b32 s4, v43, 49 +; SI-NEXT: v_readlane_b32 s4, v42, 46 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: v_readlane_b32 s4, v42, 47 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 50 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 51 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: v_readlane_b32 s4, v42, 48 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s10 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 52 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 53 +; SI-NEXT: v_readlane_b32 s4, v42, 49 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v43, 54 +; SI-NEXT: v_readlane_b32 s4, v42, 50 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 55 +; SI-NEXT: v_readlane_b32 s4, v42, 51 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 @@ -155961,7 +155436,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 56 +; SI-NEXT: v_readlane_b32 s4, v42, 52 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 @@ -155969,17 +155444,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 57 +; SI-NEXT: v_readlane_b32 s4, v42, 53 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s98 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: v_readlane_b32 s4, v42, 54 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 @@ -155988,12 +155464,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s12 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 @@ -156002,14 +155478,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -156023,7 +155499,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -156035,28 +155511,28 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s82 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s84 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 @@ -156103,120 +155579,118 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: s_mov_b32 s7, s6 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: v_readlane_b32 s92, v44, 24 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: v_readlane_b32 s91, v44, 20 -; SI-NEXT: s_mov_b32 s90, s88 -; SI-NEXT: v_readlane_b32 s36, v44, 23 -; SI-NEXT: v_readlane_b32 s35, v44, 19 -; SI-NEXT: v_readlane_b32 s62, v44, 22 -; SI-NEXT: v_readlane_b32 s38, v44, 18 -; SI-NEXT: s_mov_b32 s34, s46 -; SI-NEXT: s_mov_b32 s93, s21 -; SI-NEXT: s_mov_b32 s37, s43 -; SI-NEXT: s_mov_b32 s39, s75 -; SI-NEXT: v_readlane_b32 s72, v44, 10 -; SI-NEXT: s_mov_b32 s50, s63 -; SI-NEXT: s_mov_b32 s51, s59 -; SI-NEXT: s_mov_b32 s48, s56 -; SI-NEXT: v_readlane_b32 s30, v44, 21 -; SI-NEXT: s_mov_b32 s49, s61 -; SI-NEXT: s_mov_b32 s52, s79 -; SI-NEXT: v_readlane_b32 s98, v44, 6 -; SI-NEXT: s_mov_b32 s55, s45 -; SI-NEXT: v_readlane_b32 s43, v44, 17 -; SI-NEXT: s_mov_b32 s60, s40 -; SI-NEXT: v_readlane_b32 s41, v44, 14 -; SI-NEXT: s_mov_b32 s53, s42 -; SI-NEXT: s_mov_b32 s54, s13 -; SI-NEXT: v_readlane_b32 s14, v44, 13 -; SI-NEXT: v_readlane_b32 s44, v44, 5 -; SI-NEXT: v_readlane_b32 s9, v44, 11 -; SI-NEXT: v_readlane_b32 s81, v44, 12 -; SI-NEXT: v_readlane_b32 s82, v44, 9 -; SI-NEXT: v_readlane_b32 s10, v44, 16 -; SI-NEXT: v_readlane_b32 s12, v44, 4 -; SI-NEXT: v_readlane_b32 s96, v44, 7 -; SI-NEXT: v_readlane_b32 s83, v44, 8 -; SI-NEXT: v_readlane_b32 s71, v44, 15 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_readlane_b32 s65, v43, 23 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_mov_b32 s92, s79 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_mov_b32 s91, s26 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_readlane_b32 s37, v43, 22 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_mov_b32 s36, s72 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_mov_b32 s53, s27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_mov_b32 s50, s24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_mov_b32 s81, s46 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b32 s55, s63 +; SI-NEXT: s_mov_b32 s48, s45 +; SI-NEXT: v_readlane_b32 s73, v43, 14 +; SI-NEXT: s_mov_b32 s35, s62 +; SI-NEXT: s_mov_b32 s54, s61 +; SI-NEXT: s_mov_b32 s38, s90 +; SI-NEXT: s_mov_b32 s49, s66 +; SI-NEXT: s_mov_b32 s30, s42 +; SI-NEXT: s_mov_b32 s39, s58 +; SI-NEXT: v_readlane_b32 s44, v43, 10 +; SI-NEXT: s_mov_b32 s34, s10 +; SI-NEXT: s_mov_b32 s42, s43 +; SI-NEXT: s_mov_b32 s51, s40 +; SI-NEXT: v_readlane_b32 s41, v43, 18 +; SI-NEXT: v_readlane_b32 s52, v43, 21 +; SI-NEXT: s_mov_b32 s95, s60 +; SI-NEXT: v_readlane_b32 s14, v43, 17 +; SI-NEXT: v_readlane_b32 s11, v43, 9 +; SI-NEXT: v_readlane_b32 s9, v43, 15 +; SI-NEXT: v_readlane_b32 s56, v43, 16 +; SI-NEXT: v_readlane_b32 s15, v43, 13 +; SI-NEXT: v_readlane_b32 s7, v43, 8 +; SI-NEXT: v_readlane_b32 s8, v43, 7 +; SI-NEXT: v_readlane_b32 s97, v43, 11 +; SI-NEXT: v_readlane_b32 s83, v43, 12 +; SI-NEXT: v_readlane_b32 s77, v43, 19 +; SI-NEXT: v_readlane_b32 s71, v43, 20 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr59 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr99 ; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; kill: killed $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr87 ; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr85 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: s_branch .LBB89_2 ; ; VI-LABEL: bitcast_v128i8_to_v64bf16_scalar: @@ -156238,19 +155712,19 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill @@ -156280,10 +155754,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v44, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v29 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -156292,46 +155766,42 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v41, 8, v26 +; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v30 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v31 +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 @@ -156340,55 +155810,40 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v36 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -156399,805 +155854,824 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:36 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB89_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff ; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s8, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v44, v8 +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v10 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v38, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v39, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_or_b32_sdwa v1, v63, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_or_b32_sdwa v0, v39, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v42, v43 -; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v47, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_or_b32_sdwa v0, v41, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v52, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v35, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v61, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v36, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v34, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v28, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v25, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v46, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v28, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v57, v38 +; VI-NEXT: v_or_b32_sdwa v0, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v40, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v59, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v51, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v41 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v46, v1 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v56, v1 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v47, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v39 -; VI-NEXT: v_mov_b32_e32 v54, v33 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v62 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v52, v60 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v54, v63 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_branch .LBB89_3 ; VI-NEXT: .LBB89_2: -; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v7 -; VI-NEXT: v_mov_b32_e32 v57, v5 -; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: v_mov_b32_e32 v57, v38 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB89_3: ; %Flow -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB89_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s5, s27, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s9, s19, 8 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v29, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 -; VI-NEXT: v_or_b32_sdwa v30, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 -; VI-NEXT: v_or_b32_sdwa v28, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v44, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v27, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v63 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v40, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v26, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v34, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 -; VI-NEXT: v_or_b32_sdwa v26, v26, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v28, v53, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v21, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v21 -; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v14, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v24, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v27, v49, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v32, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v15, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v26, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 +; VI-NEXT: v_or_b32_sdwa v37, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: v_or_b32_sdwa v26, v26, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v38, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v24, v36, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v46 +; VI-NEXT: v_or_b32_sdwa v39, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v15 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v14 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 +; VI-NEXT: v_or_b32_sdwa v24, v24, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v25, v25, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v28, v28, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v61, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v61 -; VI-NEXT: v_or_b32_sdwa v23, v23, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v23, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v13 +; VI-NEXT: v_or_b32_sdwa v29, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v48, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v23, v23, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v36, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 -; VI-NEXT: v_or_b32_sdwa v22, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v22, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v49, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v22, v22, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v21, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v63, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v38, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 -; VI-NEXT: v_or_b32_sdwa v21, v63, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v50, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v20, v45, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 +; VI-NEXT: v_or_b32_sdwa v51, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v19, v43, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v52, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v19, v19, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v20, v20, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v21, v21, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: v_or_b32_sdwa v39, v45, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v19, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v48, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 -; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v62, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v62 -; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v53, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x300, v53 +; VI-NEXT: v_or_b32_sdwa v18, v18, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v51 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v50 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v49, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 -; VI-NEXT: v_or_b32_sdwa v15, v15, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_or_b32_sdwa v54, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v17, v17, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v17 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 -; VI-NEXT: v_or_b32_sdwa v14, v14, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_sdwa v29, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: v_or_b32_sdwa v55, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x300, v55 +; VI-NEXT: v_or_b32_sdwa v16, v16, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v16 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v13, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v52, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v52 -; VI-NEXT: v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v44 -; VI-NEXT: v_or_b32_sdwa v28, v28, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v40, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v41, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v1 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v42, v58, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v42 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v54, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v54, vcc, 0x300, v54 -; VI-NEXT: v_or_b32_sdwa v12, v12, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v12, v47, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v43, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v40, vcc, 0x300, v43 +; VI-NEXT: v_or_b32_sdwa v12, v12, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v50, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v11, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v44, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v11, v11, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v53, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v10, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v45, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v10, v10, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v9, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v10 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 -; VI-NEXT: v_or_b32_sdwa v49, v16, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x300, v40 -; VI-NEXT: v_or_b32_sdwa v27, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v46, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v9, v9, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v8, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v43, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v43 -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v41 -; VI-NEXT: v_or_b32_sdwa v17, v17, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v11, v50, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v49 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v0 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v30, v30, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v47, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v47 +; VI-NEXT: v_or_b32_sdwa v8, v8, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v7, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v56, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v56 ; VI-NEXT: v_or_b32_sdwa v7, v7, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v6, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v57, v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v57 ; VI-NEXT: v_or_b32_sdwa v6, v6, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v5, v32, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v47, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v58, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v47, vcc, 0x300, v47 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v47, vcc, 0x300, v58 ; VI-NEXT: v_or_b32_sdwa v5, v5, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v30, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 -; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v56, vcc, 0x300, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v4, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v56, vcc, 3, v56 -; VI-NEXT: v_or_b32_sdwa v56, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v56, s4, v56 -; VI-NEXT: s_and_b32 s4, s26, 0xff -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_and_b32 s5, s24, 0xff -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_or_b32 s7, s8, s7 -; VI-NEXT: s_and_b32 s8, s18, 0xff -; VI-NEXT: s_or_b32 s8, s9, s8 -; VI-NEXT: s_and_b32 s9, s16, 0xff -; VI-NEXT: s_or_b32 s9, s10, s9 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s8, s8, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v56 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v57, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 ; VI-NEXT: .LBB89_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -157237,35 +156711,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 @@ -157276,268 +156746,284 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 ; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v56 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v52 ; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v51 ; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v55 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v53 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v20 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v26 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v28 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 -; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v24 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:248 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 -; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v37 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:312 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:328 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:36 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:44 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v1 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:68 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(55) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(60) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -157545,722 +157031,725 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v57, v5 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v34, v35 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s4, s5 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v46, v32 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v17, v45, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v45, v59 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v55, v22 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v16, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v47, v32 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_mov_b32_e32 v33, v35 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v51, v57 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v49, v39 -; GFX9-NEXT: v_mov_b32_e32 v59, v44 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v58, v50 -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v54, v63 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v39 +; GFX9-NEXT: v_mov_b32_e32 v39, v54 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v52, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v54, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v29 +; GFX9-NEXT: v_mov_b32_e32 v34, v37 +; GFX9-NEXT: v_mov_b32_e32 v37, v53 +; GFX9-NEXT: v_mov_b32_e32 v53, v36 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v48, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v36, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v62, v61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v57, v35 -; GFX9-NEXT: v_mov_b32_e32 v35, v38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_branch .LBB89_3 ; GFX9-NEXT: .LBB89_2: -; GFX9-NEXT: v_mov_b32_e32 v58, v50 -; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v34, v35 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v49, v39 -; GFX9-NEXT: v_mov_b32_e32 v55, v22 -; GFX9-NEXT: v_mov_b32_e32 v51, v5 +; GFX9-NEXT: v_mov_b32_e32 v58, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v40 +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v46, v32 +; GFX9-NEXT: v_mov_b32_e32 v35, v32 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB89_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB89_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v16, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v16, v45, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v15, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v15, v46, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_addk_i32 s4, 0x300 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s24, s24, 3 -; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 ; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: s_lshl_b32 s6, s27, 8 -; GFX9-NEXT: s_add_i32 s20, s20, 3 -; GFX9-NEXT: s_lshl_b32 s7, s21, 8 -; GFX9-NEXT: s_add_i32 s22, s22, 3 -; GFX9-NEXT: s_lshl_b32 s8, s23, 8 -; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshl_b32 s9, s17, 8 -; GFX9-NEXT: s_add_i32 s18, s18, 3 -; GFX9-NEXT: s_lshl_b32 s10, s19, 8 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 -; GFX9-NEXT: v_or_b32_sdwa v23, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 -; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_and_b32 s4, s24, 0xff -; GFX9-NEXT: s_or_b32 s4, s5, s4 -; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 ; GFX9-NEXT: s_or_b32 s6, s7, s6 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 ; GFX9-NEXT: s_or_b32 s7, s8, s7 -; GFX9-NEXT: s_and_b32 s8, s16, 0xff +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_and_b32 s9, s18, 0xff +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_or_b32 s9, s10, s9 -; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: s_or_b32 s10, s11, s10 ; GFX9-NEXT: s_addk_i32 s5, 0x300 ; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_addk_i32 s7, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_addk_i32 s9, 0x300 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v4, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v24, v59, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v7, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v8, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v28, v48, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v28 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v9, 3, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v37, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v10, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v37, v37, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v11, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v39, v39, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v12, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v48, v52, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v48 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v13, v51, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v14, 3, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v38, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v14, v47, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v17, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v18, 3, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v39, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v18, v43, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v48, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v19, v44, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v20, 3, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v20, v42, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v50, v34, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v51, v33, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v52, v63, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v25 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v14 +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v23 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v18 +; GFX9-NEXT: v_add_u32_e32 v50, 0x300, v50 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v42, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v49, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v43, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v43 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v58 -; GFX9-NEXT: v_or_b32_sdwa v19, v51, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v51, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v44, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v45, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v62 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v46, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v47, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v49, 0x300, v19 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v55 +; GFX9-NEXT: v_add_u32_e32 v55, 0x300, v44 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v46 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v47 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v56, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v57, v57, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v35 +; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v56 +; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v27 +; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v37 +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v39 +; GFX9-NEXT: v_add_u32_e32 v39, 0x300, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v58, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v58 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v59, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v36 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v38 +; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v13 +; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v57 +; GFX9-NEXT: v_add_u32_e32 v59, 0x300, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v34 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v35, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 -; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 +; GFX9-NEXT: v_or_b32_sdwa v60, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v60 -; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 -; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 -; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v46 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v36, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 -; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_or_b32_sdwa v24, v57, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 -; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 -; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 +; GFX9-NEXT: v_or_b32_sdwa v61, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v58, 0x300, v61 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 -; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v62, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v41, 0x300, v62 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v63, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v57, 0x300, v63 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 -; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 -; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 -; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 -; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 -; GFX9-NEXT: v_add_u32_e32 v39, 0x300, v49 -; GFX9-NEXT: v_add_u32_e32 v49, 0x300, v53 -; GFX9-NEXT: v_add_u32_e32 v50, 0x300, v55 -; GFX9-NEXT: v_add_u32_e32 v53, 0x300, v45 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v56, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v5 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v43, 0x300, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v7 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v44, 0x300, v6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v8 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v9 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v45, 0x300, v8 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v7, v45, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v10 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v9 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v11 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v46, 0x300, v10 +; GFX9-NEXT: v_lshl_or_b32 v6, v46, 16, v6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v12 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v22 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v51 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v52 +; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v54 +; GFX9-NEXT: v_add_u32_e32 v54, 0x300, v42 +; GFX9-NEXT: v_add_u32_e32 v42, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v5 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v8, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; GFX9-NEXT: v_lshl_or_b32 v9, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; GFX9-NEXT: v_lshl_or_b32 v10, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v20 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v53 +; GFX9-NEXT: v_add_u32_e32 v53, 0x300, v40 +; GFX9-NEXT: v_add_u32_e32 v40, 0x300, v60 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; GFX9-NEXT: v_add_u32_e32 v47, 0x300, v12 +; GFX9-NEXT: v_lshl_or_b32 v12, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v5, v47, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_lshl_or_b32 v16, v55, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v54, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 -; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 ; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 ; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 ; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 ; GFX9-NEXT: .LBB89_5: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -158491,14 +157980,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB89_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_and_b32 v1, 0xff, v35 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 @@ -158508,186 +157992,170 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s29, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v66 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v65 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v118 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v67 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v68 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v69 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v1, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v80 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v81 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v81 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v83 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v3, v86 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v1.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v96 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v3, v86 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v54 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v98 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v87 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v97 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v102 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v101 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v1, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v114 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v113 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v116 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v133 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v134 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v161 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v129 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v147 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v166 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v3, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v148 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v149 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v151 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v2, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v177 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v180 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v149 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v42 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v180 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v115 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v1, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v44 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v45 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v44 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v3, v45 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v145 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v59 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v56 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v60 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v3, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v160 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v62 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v2, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v72 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v73 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v160 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v75 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v1, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v176 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v179 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v75 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v76 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v77 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v3, v77 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v43 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v78 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v79 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v89 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v40 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v91 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v3, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v47 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v58 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v93 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v3, v91 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v92 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v57 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v93 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v0.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v1, v92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v2.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB89_3 ; GFX11-TRUE16-NEXT: .LBB89_2: ; %cmp.true @@ -159263,233 +158731,211 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB89_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 ; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-FAKE16-NEXT: s_or_b32 s11, s11, s12 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s26, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s11 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v66 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v64 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v49 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v6, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v6, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v85 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v9, v83 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v100 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v96 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v11, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v99 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v12, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v13, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v12, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v14, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v15, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v15, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v16, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v17, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v165 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v42 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v18, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v19, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v20, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v131 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v145 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v59 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v21, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v22, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v23, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v24, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v163 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v176 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v24, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v25, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v26, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v27, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v183 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v43 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v78 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v89 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v27, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v28, v88 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v58 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v29, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v30, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v46 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v92 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v90 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB89_3 ; GFX11-FAKE16-NEXT: .LBB89_2: ; %cmp.true @@ -159917,7 +159363,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 @@ -159933,126 +159379,128 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v30 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v29 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v36 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v36 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v37 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v39 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v49 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 @@ -160060,83 +159508,83 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v55 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:124 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; kill: killed $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v53 ; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v63 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v58 +; SI-NEXT: ; kill: killed $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v1 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v4 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v5 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v7 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v10 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v6 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -160309,9 +159757,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v3 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -160322,634 +159770,643 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v38, v1, v2, 16 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v35, v1, v2, 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v36, v1, v2, 16 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v32, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v33, v1, v2, 16 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v29, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v30, v1, v2, 16 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v26, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v40, v1, v2, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v27, v1, v2, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v54, v1, v2, 16 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, v1, v2, 16 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v18, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v19, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v52, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v40, v1, v2, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v15, v1, v57, 16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v17, v1, v63, 16 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v50, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v54, v1, v2, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v13, v1, v61, 16 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_alignbit_b32 v15, v1, v10, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v52, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_alignbit_b32 v12, v1, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v50, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_alignbit_b32 v9, v1, v37, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_alignbit_b32 v48, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v11, v1, v60, 16 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v37, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_alignbit_b32 v6, v1, v39, 16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v38, v1, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v8, v1, v51, 16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_alignbit_b32 v5, v1, v53, 16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v34, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v35, v1, v2, 16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_alignbit_b32 v5, v1, v53, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_alignbit_b32 v4, v1, v57, 16 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v31, v1, v2, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v32, v1, v2, 16 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_alignbit_b32 v4, v1, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_alignbit_b32 v3, v1, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v29, v1, v2, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 +; SI-NEXT: v_alignbit_b32 v2, v1, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_alignbit_b32 v26, v1, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_alignbit_b32 v1, v1, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v25 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v28, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v24, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_alignbit_b32 v3, v1, v41, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v25, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v21, v7, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: v_alignbit_b32 v2, v1, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_alignbit_b32 v22, v1, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v20, v7, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v18, v7, v8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v44 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v59 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v14, v7, v23, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v41 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v11, v7, v28, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v40, v38, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v42 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v7, v49, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v40, v38, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v60 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v54, v35, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v63 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v54, v35, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v10 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v54, v35, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v52, v32, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v34 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v52, v32, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v37 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v52, v32, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v50, v29, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v51 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v50, v29, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v45 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v50, v29, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v61 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v48, v26, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v25 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v48, v26, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v48, v26, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v59 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v37, v23, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v41 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v37, v23, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v42 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v37, v23, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v40, v36, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v34, v18, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v40, v36, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v34, v18, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v40, v36, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v34, v18, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v54, v33, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v54, v33, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v54, v33, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v52, v30, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v28, v13, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v52, v30, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v28, v13, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v52, v30, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v28, v13, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v50, v27, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v25, v11, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v50, v27, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v25, v11, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v50, v27, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v25, v11, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v48, v22, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v8, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v48, v22, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v8, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v48, v22, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v8, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v38, v19, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v38, v19, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v38, v19, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v35, v17, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_alignbit_b32 v17, v7, v9, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 +; SI-NEXT: v_alignbit_b32 v7, v35, v17, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v35, v17, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16 -; SI-NEXT: v_alignbit_b32 v14, v7, v27, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v32, v15, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v32, v15, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v32, v15, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v29, v12, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16 -; SI-NEXT: v_alignbit_b32 v10, v7, v39, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v29, v12, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v29, v12, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v26, v9, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v26, v9, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v44 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v26, v9, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v24, v6, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v7, v55, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v24, v6, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v24, v6, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v21, v5, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v21, v5, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v21, v5, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v40 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v18, v4, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v54 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v18, v4, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v52 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v18, v4, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v50 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v14, v3, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v48 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v14, v3, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v37 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v14, v3, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v11, v2, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v31 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v11, v2, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v28 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v11, v2, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v25 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v8, v1, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v22 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v8, v1, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v20 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v8, v1, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v17 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v14 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v54 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v10 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v52 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v7 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v58 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v50 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v33 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v48 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v47 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v38 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v57 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v35 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v42 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v59 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v29 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v49 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v26 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v51 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v62 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v21 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v45 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v18 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v24 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v36 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v11 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v56 ; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: .LBB90_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v62 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v5, v8, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v47 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v36 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v8, v11, v8, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v11, v14, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_alignbit_b32 v9, v12, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v12, v15, v12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v17, v17, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_alignbit_b32 v15, v15, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v18, v10, v17, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v63 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_alignbit_b32 v17, v21, v17, 16 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v16 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 ; SI-NEXT: s_waitcnt vmcnt(4) @@ -160958,830 +160415,820 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v54 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v48 ; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v50 ; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v52 ; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v54 ; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v40 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v49 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v51 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v53 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v55 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v41 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v20, v15, 16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v21, v21, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v23, v18, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v20 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v13 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v26, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v19, v24, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v24, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v29, v26, 16 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v22, v27, v22, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v25 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v32, v29, 16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v27, v30, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 24, v28 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_alignbit_b32 v32, v35, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v35, v34, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_alignbit_b32 v30, v33, v30, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v32, v33, v32, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v35 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v35 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; SI-NEXT: v_alignbit_b32 v35, v38, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v36 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v37, v38, v37, 16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_alignbit_b32 v33, v36, v33, 16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v34 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v35, v36, v35, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v36 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v34 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; SI-NEXT: v_alignbit_b32 v38, v49, v38, 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v39 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v48, v49, v48, 16 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; SI-NEXT: v_alignbit_b32 v36, v39, v36, 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v38, v39, v38, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v39 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v37 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; SI-NEXT: v_alignbit_b32 v50, v50, v49, 16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_alignbit_b32 v48, v48, v39, 16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; SI-NEXT: v_alignbit_b32 v52, v52, v49, 16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_alignbit_b32 v50, v50, v39, 16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; SI-NEXT: v_alignbit_b32 v54, v54, v49, 16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_alignbit_b32 v52, v52, v39, 16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; SI-NEXT: v_alignbit_b32 v40, v40, v49, 16 -; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_alignbit_b32 v54, v54, v39, 16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_alignbit_b32 v40, v40, v39, 16 +; SI-NEXT: v_alignbit_b32 v7, v40, v36, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v40, v38, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v40, v36, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v40, v38, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v40, v36, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v54, v35, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v54, v33, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v54, v35, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v54, v33, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v54, v35, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v54, v33, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v52, v32, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v52, v30, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v52, v32, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v52, v30, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v52, v32, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v52, v30, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v50, v29, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v50, v27, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v50, v29, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v50, v27, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v50, v29, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v50, v27, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v48, v26, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v48, v22, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v48, v26, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v48, v22, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v48, v26, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v48, v22, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v37, v23, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v38, v19, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v37, v23, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v38, v19, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v37, v23, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v38, v19, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v34, v18, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v35, v17, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v34, v18, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v35, v17, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v34, v18, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v35, v17, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v32, v15, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v32, v15, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v32, v15, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v28, v13, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v29, v12, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v28, v13, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v29, v12, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v28, v13, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v29, v12, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v25, v11, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v26, v9, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v25, v11, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v26, v9, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v25, v11, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v26, v9, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v8, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v24, v6, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v8, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v24, v6, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v8, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v24, v6, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v21, v5, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v21, v5, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v21, v5, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v7, v18, v4, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v18, v4, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v18, v4, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v14, v3, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v14, v3, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v14, v3, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v11, v2, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v11, v2, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v11, v2, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v8, v1, 24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v8, v1, 16 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v8, v1, 8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v40 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v40 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v54 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v54 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v52 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v52 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v50 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v50 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v48 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v48 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v38 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v37 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v35 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v31 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v29 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v28 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v26 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v25 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v24 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v41 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v22 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v21 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v41 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v55 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v17 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v18 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v55 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v53 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v14 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v53 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v51 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v10 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v11 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v51 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v49 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v7 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v8 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: .LBB90_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v40 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v35 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v33 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v54 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v54 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v30 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v52 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v29 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v27 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v50 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v26 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v22 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v48 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v48 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v23 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v19 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v37 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v38 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v17 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v34 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v35 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v15 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v15 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v31 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v32 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v13, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v13 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v29 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v25 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 @@ -161792,14 +161239,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v47 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v60 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 @@ -161809,12 +161256,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 @@ -161825,14 +161272,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 @@ -161842,12 +161291,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -161858,14 +161307,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -161875,12 +161326,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -161891,15 +161342,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -161934,6 +161385,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -161950,100 +161402,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -162055,7 +161420,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -162067,7 +161433,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -162079,13 +161446,18 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -162097,7 +161469,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -162109,7 +161482,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 @@ -162121,216 +161495,281 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; kill: killed $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB90_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v20 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 -; VI-NEXT: v_mov_b32_e32 v45, v46 -; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v46, v63 -; VI-NEXT: v_mov_b32_e32 v63, v50 -; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v32 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v44, v46 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; VI-NEXT: v_mov_b32_e32 v51, v57 -; VI-NEXT: v_mov_b32_e32 v50, v56 -; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 -; VI-NEXT: v_mov_b32_e32 v57, v43 -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[23:24] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[7:8] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[3:4] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[1:2] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[23:24] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: v_mov_b32_e32 v47, v34 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v22 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v17 ; VI-NEXT: .LBB90_2: ; %Flow -; VI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_xor_b64 exec, exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB90_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 ; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 ; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 @@ -162907,227 +162346,225 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_alignbit_b32 v15, v15, v33, 16 ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27 -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v10 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v9 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v31 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v17 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v22 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v17 ; VI-NEXT: .LBB90_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v37 +; VI-NEXT: v_or_b32_sdwa v1, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v44 -; VI-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v43 -; VI-NEXT: v_or_b32_sdwa v2, v2, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v43 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v59 +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v43, v44, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v59, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -163138,90 +162575,115 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v60 -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v51 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41 -; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v50 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload @@ -163237,18 +162699,17 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload @@ -163264,140 +162725,96 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v34 ; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 ; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 ; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -163408,9 +162825,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -163421,39 +162838,57 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -163477,6 +162912,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-LABEL: bitcast_v64bf16_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -163493,1506 +162934,1404 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: v_mov_b32_e32 v46, v0 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v44, v15 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v47, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; kill: killed $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; kill: killed $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; kill: killed $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; kill: killed $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; kill: killed $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; kill: killed $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; kill: killed $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; kill: killed $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; kill: killed $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; kill: killed $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; kill: killed $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; kill: killed $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: v_mov_b32_e32 v30, v18 +; GFX9-NEXT: v_mov_b32_e32 v29, v17 +; GFX9-NEXT: v_mov_b32_e32 v45, v16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr46 ; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; kill: killed $vgpr59 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB90_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v63 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v63 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(44) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(35) -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2] +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_mov_b32_e32 v37, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; GFX9-NEXT: v_mov_b32_e32 v36, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 24, v28 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; GFX9-NEXT: v_mov_b32_e32 v54, v40 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; GFX9-NEXT: v_lshrrev_b64 v[48:49], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v30 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[62:63], 24, v[29:30] +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v5 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[62:63] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(46) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v18 +; GFX9-NEXT: v_lshrrev_b64 v[31:32], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v17 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[44:45] +; GFX9-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: v_mov_b32_e32 v13, v36 +; GFX9-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v26 +; GFX9-NEXT: v_mov_b32_e32 v14, v37 +; GFX9-NEXT: v_lshrrev_b64 v[36:37], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v2, v39 +; GFX9-NEXT: v_mov_b32_e32 v37, v38 +; GFX9-NEXT: v_lshrrev_b64 v[38:39], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[31:32], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[31:32], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v29 ; GFX9-NEXT: .LBB90_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB90_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 ; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v31, v18, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v47, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v31, v31, v18, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v56, v31, v32, vcc -; GFX9-NEXT: v_bfe_u32 v31, v18, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_add3_u32 v31, v31, v18, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v57, v31, v32, vcc -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v31, vcc -; GFX9-NEXT: v_perm_b32 v13, v17, v57, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v31, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_perm_b32 v14, v56, v47, s7 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v20, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v32, v13, v0, s7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v20, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_perm_b32 v31, v17, v0, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v31, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v18, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v31, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v17, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v17, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v13, v17, v0, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v20, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v16, v31, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v50, v16, v19, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v19, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v23 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v13, v17, v0, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v13, v17, v0, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_perm_b32 v60, v17, v25, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v29 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: v_perm_b32 v33, v17, v24, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v63 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v63 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: s_waitcnt vmcnt(50) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v62 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v44, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v62 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v18, v19, vcc -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v22, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v21 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v39, v16, v31, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v58, v16, v21, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v21, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v24, vcc +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v35, v17, v23, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v43, v18, v19, vcc -; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v17, v18, vcc -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc -; GFX9-NEXT: v_perm_b32 v37, v1, v22, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v2, v17, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v2, v4, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_perm_b32 v48, v1, v19, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_perm_b32 v50, v1, v4, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc -; GFX9-NEXT: v_perm_b32 v52, v1, v3, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v7, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_mov_b32_e32 v59, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_mov_b32_e32 v58, v31 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v31, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v31, vcc -; GFX9-NEXT: v_perm_b32 v39, v1, v2, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_bfe_u32 v12, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v31, vcc -; GFX9-NEXT: v_add3_u32 v12, v12, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v31, vcc -; GFX9-NEXT: v_bfe_u32 v31, v1, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v31, v31, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v31, v32, vcc -; GFX9-NEXT: v_bfe_u32 v31, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v31, v31, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v31, v32, vcc -; GFX9-NEXT: v_perm_b32 v54, v11, v1, s7 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_bfe_u32 v31, v11, 16, 1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v16, v32, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v62, v16, v23, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v46, v16, v23, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v16, v26, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v57, v16, v33, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v16, v25, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v16, v25, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v16, v28, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v42, v16, v34, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v60, v16, v27, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v16, v27, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v16, v30, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v59, v16, v35, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v31, v31, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v35, v16, v29, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v31, v32, vcc -; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v31, v31, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_bfe_u32 v15, v14, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v16, v29, vcc +; GFX9-NEXT: v_add3_u32 v15, v15, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v15, v16, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v61, v28, v0, s7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v41, vcc -; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v61, v16, v36, vcc +; GFX9-NEXT: v_add3_u32 v15, v15, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc -; GFX9-NEXT: v_perm_b32 v41, v13, v0, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v36, v15, v16, vcc +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v15, v15, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc -; GFX9-NEXT: v_bfe_u32 v31, v16, 16, 1 -; GFX9-NEXT: v_add3_u32 v31, v31, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_bfe_u32 v26, v31, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v15, v16, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_add3_u32 v26, v26, v31, s6 -; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v31, v15, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v26, v45, vcc -; GFX9-NEXT: v_add3_u32 v31, v31, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v15 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v63, v16, v37, vcc +; GFX9-NEXT: v_add3_u32 v15, v15, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v37, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v15, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_bfe_u32 v15, v3, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v40, v16, v38, vcc +; GFX9-NEXT: v_add3_u32 v15, v15, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v38, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v15, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v15, v15, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v15, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v16, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_bfe_u32 v15, v5, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v54, v16, v48, vcc +; GFX9-NEXT: v_add3_u32 v15, v15, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v48, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v15, v5, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v15, v15, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v15, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v15, v16, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v31, v45, vcc -; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 -; GFX9-NEXT: v_perm_b32 v31, v15, v26, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v51, v16, v49, vcc +; GFX9-NEXT: v_add3_u32 v15, v15, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v16, v7, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v16, v16, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v49, vcc +; GFX9-NEXT: v_bfe_u32 v16, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v49, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_bfe_u32 v49, v16, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_add3_u32 v49, v49, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_bfe_u32 v16, v9, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc +; GFX9-NEXT: v_add3_u32 v16, v16, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v16, v53, vcc +; GFX9-NEXT: v_bfe_u32 v53, v9, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v53, v53, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v53, v41, vcc +; GFX9-NEXT: v_bfe_u32 v53, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v53, v53, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v53, v41, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_bfe_u32 v41, v53, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_add3_u32 v41, v41, v53, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_bfe_u32 v53, v11, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v34, v41, v43, vcc +; GFX9-NEXT: v_add3_u32 v53, v53, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v53, v41, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_bfe_u32 v41, v53, 16, 1 +; GFX9-NEXT: v_add3_u32 v41, v41, v53, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v41, v43, vcc +; GFX9-NEXT: v_bfe_u32 v41, v53, 16, 1 +; GFX9-NEXT: v_add3_u32 v41, v41, v53, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v41, v43, vcc +; GFX9-NEXT: v_bfe_u32 v41, v53, 16, 1 +; GFX9-NEXT: v_add3_u32 v41, v41, v53, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v41, v43, vcc +; GFX9-NEXT: v_bfe_u32 v41, v53, 16, 1 +; GFX9-NEXT: v_add3_u32 v41, v41, v53, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 16, v45 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v41, v43, vcc +; GFX9-NEXT: v_bfe_u32 v41, v53, 16, 1 +; GFX9-NEXT: v_add3_u32 v41, v41, v53, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v45 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v41, v43, vcc +; GFX9-NEXT: v_bfe_u32 v41, v53, 16, 1 +; GFX9-NEXT: v_add3_u32 v41, v41, v53, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 16, v44 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v41, v43, vcc +; GFX9-NEXT: v_bfe_u32 v41, v53, 16, 1 +; GFX9-NEXT: v_add3_u32 v41, v41, v53, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v44 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_cndmask_b32_e32 v55, v41, v43, vcc +; GFX9-NEXT: v_bfe_u32 v41, v53, 16, 1 +; GFX9-NEXT: v_add3_u32 v41, v41, v53, s6 +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_perm_b32 v45, v14, v18, s6 +; GFX9-NEXT: v_mov_b32_e32 v14, v39 +; GFX9-NEXT: v_mov_b32_e32 v44, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v52, v41, v43, vcc +; GFX9-NEXT: v_perm_b32 v43, v58, v14, s6 +; GFX9-NEXT: v_perm_b32 v58, v62, v44, s6 +; GFX9-NEXT: v_perm_b32 v62, v35, v59, s6 +; GFX9-NEXT: v_perm_b32 v35, v16, v49, s6 +; GFX9-NEXT: v_perm_b32 v16, v0, v32, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v31, v31, v1, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v32 +; GFX9-NEXT: v_perm_b32 v32, v2, v3, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v36, v44, v29, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; GFX9-NEXT: v_perm_b32 v38, v21, v43, s7 -; GFX9-NEXT: v_perm_b32 v49, v18, v20, s7 -; GFX9-NEXT: v_perm_b32 v53, v8, v5, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v51, v6, v17, s7 -; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 +; GFX9-NEXT: v_perm_b32 v56, v50, v17, s6 +; GFX9-NEXT: v_perm_b32 v50, v48, v54, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; GFX9-NEXT: v_perm_b32 v54, v13, v19, s6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v55 +; GFX9-NEXT: v_perm_b32 v41, v33, v57, s6 +; GFX9-NEXT: v_perm_b32 v33, v36, v61, s6 +; GFX9-NEXT: v_perm_b32 v48, v15, v51, s6 +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v36, v10, v7, s6 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v47 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v51 +; GFX9-NEXT: v_perm_b32 v51, v6, v4, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v44 +; GFX9-NEXT: v_perm_b32 v39, v37, v63, s6 +; GFX9-NEXT: v_perm_b32 v37, v11, v34, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v34 +; GFX9-NEXT: v_perm_b32 v34, v22, v29, s6 +; GFX9-NEXT: v_perm_b32 v53, v38, v40, s6 +; GFX9-NEXT: v_perm_b32 v38, v12, v9, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; GFX9-NEXT: v_perm_b32 v49, v8, v5, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v63 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; GFX9-NEXT: v_perm_b32 v40, v20, v21, s6 +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v61 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v61, v28, v25, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; GFX9-NEXT: v_perm_b32 v15, v52, v55, s6 +; GFX9-NEXT: v_perm_b32 v60, v60, v42, s6 +; GFX9-NEXT: v_perm_b32 v63, v30, v27, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v42 +; GFX9-NEXT: v_perm_b32 v42, v26, v46, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v13 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_perm_b32 v59, v0, v13, s6 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v13 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v44, v0, v13, s6 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v44 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v13 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v57, v0, v13, s6 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v56 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[41:42] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[54:55] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[39:40] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[52:53] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[50:51] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[48:49] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v46, v0, v13, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[37:38] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 +; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX9-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 +; GFX9-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[35:36] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v32 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[37:38] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v42 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v42 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[35:36] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v41 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v55 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v55 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v35 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[48:49] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v51 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v51 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v50 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v50 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v53 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v53 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v40 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v33 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v63 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v63 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v62 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v61 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v60 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v60 +; GFX9-NEXT: v_lshrrev_b64 v[36:37], 24, v[39:40] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v39 +; GFX9-NEXT: v_lshrrev_b64 v[38:39], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v42 +; GFX9-NEXT: v_lshrrev_b64 v[48:49], 24, v[50:51] +; GFX9-NEXT: v_lshrrev_b64 v[39:40], 24, v[41:42] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: v_lshrrev_b64 v[49:50], 24, v[62:63] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v39 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v61 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v60 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v60 -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_mov_b32_e32 v34, v62 -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v52 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v50 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: v_mov_b32_e32 v35, v63 -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v52 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v63, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v62, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v58 -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v61 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v60 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v41 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[58:59] +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v34 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[43:44] +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v44 +; GFX9-NEXT: v_lshrrev_b64 v[62:63], 24, v[45:46] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v45 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v45 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b64 v[31:32], 24, v[53:54] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[60:61] +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v61 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v43 +; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[56:57] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 24, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: .LBB90_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v40 -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 -; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v42 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v48 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v2 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v36 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v12 -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v32, v36, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v14, 8, v14 -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v31 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v31 -; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v48 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v35 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v42 ; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v55 -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v50 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v61 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v49 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v59 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 ; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -165001,85 +164340,74 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -165212,7 +164540,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 @@ -165253,31 +164581,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 @@ -165287,11 +164615,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[13:14] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[133:134], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[27:28] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v15 @@ -165342,12 +164670,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[13:14] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[80:81] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[98:99], 24, v[25:26] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[23:24] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[21:22] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[19:20] @@ -165922,7 +165250,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v138.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, v127.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v128, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff @@ -165932,7 +165260,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v40.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, v127.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v66.l, v182.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v152, v4, v6, vcc_lo @@ -165944,35 +165272,35 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v152.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v138.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v153, v2, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v66.l, v182.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[128:129] +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v152.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[102:103] ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v154, v7, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[102:103] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[128:129] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[35:36] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[33:34] ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, v153.h ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[133:134], 24, v[68:69] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[66:67] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[33:34] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[31:32] ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v146, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v154.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[31:32] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 24, v147 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v147 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 24, v129 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v129 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[146:147] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[48:49] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[98:99], 24, v[48:49] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v146 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v129 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v128 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v103 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v103 @@ -166066,7 +165394,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v152.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v61.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v100.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v138.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v60.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v129.h @@ -166141,7 +165469,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h ; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v98.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v163.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v79.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v49.h @@ -166149,7 +165477,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v166.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v75.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v97.l ; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l @@ -166263,124 +165591,123 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Spill -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x14 ; 84-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:12 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 @@ -166389,103 +165716,103 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v2 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v1 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-FAKE16-NEXT: .LBB90_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB90_4 @@ -166493,754 +165820,782 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v18 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v17 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v34, 0x40c00000, v34 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v17 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v33, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v33 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v35, 16, 1 ; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v33, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v17, v17, v36 :: v_dual_and_b32 v18, 0xffff0000, v18 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v34, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v17, v17, v36 :: v_dual_add_f32 v18, 0x40c00000, v18 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v35, 0x7fff ; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v18, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v18 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v18, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v77, v37, v39 :: v_dual_add_f32 v34, 0x40c00000, v34 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; GFX11-FAKE16-NEXT: v_perm_b32 v69, v77, v17, 0x7060302 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v34, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v37, v39 :: v_dual_lshlrev_b32 v37, 16, v20 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v69 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v34, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v69 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v38, v18 :: v_dual_add_f32 v20, 0x40c00000, v20 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v35, 16, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v37 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v71, v18, v17, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v37, 16, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v38, v33, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v48, v35, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v18, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v18 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v36, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v19 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v18, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX11-FAKE16-NEXT: v_perm_b32 v68, v34, v33, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v35, v37, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v19 :: v_dual_lshlrev_b32 v37, 16, v22 -; GFX11-FAKE16-NEXT: v_add3_u32 v34, v34, v20, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v68 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v68 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v34, v19, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v36, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v36 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v38, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v65, v19, v18, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v20, v39, v36, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v19 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v65 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v20, v34, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v20, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v39, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v20 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v20, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX11-FAKE16-NEXT: v_perm_b32 v64, v35, v34, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v65 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v37, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v36, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v39, 0x40c00000, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v64 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v64 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[64:65] -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v24 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v22, v48, v37, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v39 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v39, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v35, v21, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v36, v39, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v19 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v20 +; GFX11-FAKE16-NEXT: v_perm_b32 v70, v35, v33, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v20, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v34, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v35, v20, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v36, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 24, v71 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v70 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v35, v48, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v35, v49, v37, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v22 +; GFX11-FAKE16-NEXT: v_bfe_u32 v34, v39, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v71, v21, v20, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v21 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v35, v22, v35 :: v_dual_add_f32 v22, 0x40c00000, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v20 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v34, v39, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v48 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v35 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v22, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v22 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v22, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v70, v36, v35, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v38, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v38, 16, v21 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v35, 0x7fff +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v21 +; GFX11-FAKE16-NEXT: v_perm_b32 v80, v36, v34, 0x7060302 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v38, 0x40c00000, v38 :: v_dual_cndmask_b32 v21, v37, v39 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v38, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v37, v39 :: v_dual_lshlrev_b32 v39, 16, v26 -; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, 0x400000, v24 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v24, v49, v38, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v48 -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v48, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_cndmask_b32 v23, v36, v23 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-FAKE16-NEXT: v_bfe_u32 v35, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v81, v20, v19, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34 +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v22, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v22 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v35, v48, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 24, v81 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v22, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 16, v80 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v36, v49 :: v_dual_lshlrev_b32 v49, 16, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v38 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 16, v70 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v23 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v24, v36, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v39 +; GFX11-FAKE16-NEXT: v_add3_u32 v36, v50, v38, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v35, v36, v37 :: v_dual_and_b32 v24, 0xffff0000, v24 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v25 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX11-FAKE16-NEXT: v_perm_b32 v81, v23, v22, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v24, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v37, v49, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v39, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v49 :: v_dual_lshlrev_b32 v39, 16, v23 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v82, v37, v35, 0x7060302 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v36, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v36 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v23 +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v24, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v36, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v24, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v49, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v38, v48, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v24, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v80, v37, v36, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v39, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v38, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v37, v37, v26, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v26, v50, v39, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v28 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v49 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v37, v25, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v37, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_perm_b32 v83, v22, v21, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v36, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v37, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v37, v51, v39, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v26 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v49, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v80 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v37, v26, v37 :: v_dual_add_f32 v26, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; GFX11-FAKE16-NEXT: v_perm_b32 v85, v24, v23, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v27 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v26, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v38, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v26 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v48, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v37, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v25 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v37 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v37, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v25 +; GFX11-FAKE16-NEXT: v_perm_b32 v84, v38, v36, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v48, 0x40c00000, v48 :: v_dual_cndmask_b32 v25, v39, v49 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v37, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 16, v84 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v48, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v26, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v26 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v26, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v82, v38, v37, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v28 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v48, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v39, v49, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v28, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v28, v51, v48, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v30 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v50 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v38, v27, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v38, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v37, v50, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v82 +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v38, v26, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v38, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v38, v52, v48, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 -; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v50, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX11-FAKE16-NEXT: v_perm_b32 v83, v25, v24, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v28, v38, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v49 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_perm_b32 v87, v26, v25, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v26 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v49, 16, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v28, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v51, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 16, v27 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v38, 0x40c00000, v51 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-FAKE16-NEXT: v_perm_b32 v86, v39, v37, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v38, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v38 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v27 +; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v49, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v38, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v48, v50 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v49 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v28, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v28 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v28, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v84, v39, v38, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v29 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v30 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v48, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v30, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v51, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v30, v52, v49, 0x7fff -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v32 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v51 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v39, v29, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, 0x400000, v49 +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v39, v28, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v39, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v39, v53, v49, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v30 +; GFX11-FAKE16-NEXT: v_bfe_u32 v38, v51, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v51, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v83 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v30, v39 :: v_dual_add_f32 v30, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; GFX11-FAKE16-NEXT: v_perm_b32 v97, v28, v27, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v38, v51, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v31 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v30, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v48, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v30 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v39, 0x40c00000, v52 :: v_dual_cndmask_b32 v48, v50, v53 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v39, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v29 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v39 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v39, 0x7fff ; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v29 +; GFX11-FAKE16-NEXT: v_perm_b32 v96, v48, v38, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v49, v51, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v49, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v39, v52, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v39, v52, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v30, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v30 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v30, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v86, v48, v39, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v50, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v49, v51, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v32, v53, v50, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v48, v31, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v30, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v48, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v48, v54, v50, 0x7fff ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-FAKE16-NEXT: v_perm_b32 v85, v27, v26, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v32, v48, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v51 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v32 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v48, v49 :: v_dual_and_b32 v32, 0xffff0000, v32 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v51, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v32, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v32 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_cndmask_b32 v49, v51, v54 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v31 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v48, 0x40c00000, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v32 +; GFX11-FAKE16-NEXT: v_perm_b32 v98, v49, v39, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v32, 16, 1 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v31 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v48, 16, 1 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v32, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v96, v49, v48, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v2, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, 0x400000, v48 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v32, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v48, 0x7fff ; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v51, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v50, v52, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v49, v49, v2, 0x7fff -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-FAKE16-NEXT: v_bfe_u32 v32, v53, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v53, 16, 1 +; GFX11-FAKE16-NEXT: v_perm_b32 v99, v30, v29, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v50, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v51 -; GFX11-FAKE16-NEXT: v_perm_b32 v87, v29, v28, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v49, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v48, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v49, v54, vcc_lo ; GFX11-FAKE16-NEXT: v_add3_u32 v49, v55, v51, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v4 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v32, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v49, v50, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v49, 0x40c00000, v54 +; GFX11-FAKE16-NEXT: v_perm_b32 v101, v32, v31, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v49, 0x40c00000, v54 :: v_dual_cndmask_b32 v50, v52, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v49, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v52, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v49 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-FAKE16-NEXT: v_perm_b32 v100, v50, v48, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v1 ; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v49, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v98, v50, v32, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v4, 16, 1 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[100:101] +; GFX11-FAKE16-NEXT: v_bfe_u32 v49, v54, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v51, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v52, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, 0x400000, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[98:99] +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v49, v54, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[96:97] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[86:87] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v2, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v2 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v2, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v50, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v50, v64, v52, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v4 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v54 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v49, v50, v51 :: v_dual_and_b32 v4, 0xffff0000, v4 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v50, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v50, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v53, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v50 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v50, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v102, v51, v49, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v4, 16, 1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v52, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v4, 0x7fff ; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v4, 0x7fff -; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v51, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v53, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v3, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v98 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v98 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v50, v54, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v50, v55, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v6 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v3, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 16, v102 +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v3, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v76, 8, v102 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v51, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v51, v64, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v6 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v3, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v54 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v53, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v54, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v53, 16, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v51, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v51, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-FAKE16-NEXT: v_perm_b32 v100, v3, v50, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v52, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v53, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v54, 0x40c00000, v54 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v112, v3, v51, 0x7060302 ; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v51, v51, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v53, 16, 1 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v54, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v5, 16, 1 ; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v52, v55, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add3_u32 v6, v66, v53, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v53 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX11-FAKE16-NEXT: v_perm_b32 v97, v31, v30, 0x7060302 -; GFX11-FAKE16-NEXT: v_perm_b32 v103, v3, v51, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v3 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v6, v54 :: v_dual_add_f32 v6, 0x40c00000, v55 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v7 +; GFX11-FAKE16-NEXT: v_add3_u32 v6, v65, v54, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v54 +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v53, v5, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-FAKE16-NEXT: v_perm_b32 v113, v4, v50, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v103, v2, v1, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v6, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v7 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v6 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v52, v66, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v6, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v8 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_add_f32 v6, 0x40c00000, v64 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v53, v65, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v4 +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v55, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v50 +; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v6, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v6 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX11-FAKE16-NEXT: v_perm_b32 v102, v5, v53, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v114, v5, v54, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v3 +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v53, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v113 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 8, v113 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 16, v114 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v114 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v53, v64, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v7, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v55 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v112 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v112 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 24, v103 +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v53, v7, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v103 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v6, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v54, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v99, v2, v1, 0x7060302 -; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v52, v55, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v8 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v7, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v8, v67, v54, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, 0x400000, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v7 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v66, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v10 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v7, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v8, v66, v55, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v7 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v65, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v10 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v8, v55, vcc_lo -; GFX11-FAKE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v66 +; GFX11-FAKE16-NEXT: v_perm_b32 v115, v3, v52, 0x7060302 +; GFX11-FAKE16-NEXT: v_perm_b32 v43, v5, v6, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v5 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v8, v64 :: v_dual_add_f32 v8, 0x40c00000, v65 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-FAKE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v55, 16, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v8 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v52, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v8, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v53, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v8, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX11-FAKE16-NEXT: v_perm_b32 v182, v7, v54, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_perm_b32 v42, v7, v55, 0x7060302 ; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v8, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v55, 16, 1 -; GFX11-FAKE16-NEXT: v_perm_b32 v183, v5, v6, 0x7060302 +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v53, v8, 0x7fff +; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v64, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 24, v43 ; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v52, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v53, v65, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: v_bfe_u32 v52, v9, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v112, v55, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v67, vcc_lo -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v12 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v52, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v67, v64, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v64 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 8, v43 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v42 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v66 :: v_dual_lshlrev_b32 v66, 16, v12 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-FAKE16-NEXT: v_perm_b32 v101, v4, v49, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v4 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v55, v10, v66 :: v_dual_add_f32 v10, 0x40c00000, v67 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 24, v115 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v115 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v49 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v10, 16, 1 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v112 :: v_dual_lshlrev_b32 v52, 16, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v12, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v10, v65, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v66 +; GFX11-FAKE16-NEXT: v_bfe_u32 v66, v12, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfe_u32 v65, v10, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v66, v12, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v53, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v53, v67, vcc_lo +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v11 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v176, v9, v55, 0x7060302 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v52 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v66, v10, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v10 -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v67, v12, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v12 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v9, 16, 1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v52, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v12 +; GFX11-FAKE16-NEXT: v_perm_b32 v180, v9, v64, 0x7060302 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v53 +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v65, v10, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v10 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-FAKE16-NEXT: v_perm_b32 v181, v7, v8, 0x7060302 +; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v9, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v55 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v53, v65, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v9 -; GFX11-FAKE16-NEXT: v_add3_u32 v52, v113, v9, 0x7fff -; GFX11-FAKE16-NEXT: v_perm_b32 v177, v7, v8, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v67, v112 :: v_dual_lshlrev_b32 v67, 16, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v112, v11, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v9 +; GFX11-FAKE16-NEXT: v_add3_u32 v53, v68, v9, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v66, v67, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v67, v11, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v26 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v52, v66 :: v_dual_add_f32 v52, 0x40c00000, v67 -; GFX11-FAKE16-NEXT: v_add3_u32 v66, v112, v11, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v13 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v52, 16, 1 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v48 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v181 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v181 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v53, v65, vcc_lo +; GFX11-FAKE16-NEXT: v_add3_u32 v65, v67, v11, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v13 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v66, v67 :: v_dual_add_f32 v66, 0x40c00000, v112 -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v52, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v14, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v66, 16, 1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v14 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 16, v180 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FAKE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v66 -; GFX11-FAKE16-NEXT: v_perm_b32 v162, v11, v9, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v67, v112, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v67, v113, v14, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v14 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v66, 0x7fff -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v16 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfe_u32 v117, v13, 16, 1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v53, 16, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v65, v66, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v65, 0x40c00000, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v53 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v68, v53, 0x7fff +; GFX11-FAKE16-NEXT: v_perm_b32 v166, v11, v9, 0x7060302 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-FAKE16-NEXT: v_bfe_u32 v69, v65, 16, 1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v66, v67, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v167, v12, v10, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v64 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v14, 16, 1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v52 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 24, v167 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v166 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add3_u32 v66, v68, v14, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v68, v69, v65, 0x7fff +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v16 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v14 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v13, 16, 1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v67, v112 :: v_dual_add_f32 v67, 0x40c00000, v114 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 -; GFX11-FAKE16-NEXT: v_add3_u32 v112, v116, v13, 0x7fff -; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_perm_b32 v149, v14, v52, 0x7060302 -; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v67, 16, 1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v113, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_or_b32_e32 v113, 0x400000, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v66, v67, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-FAKE16-NEXT: v_add3_u32 v67, v117, v13, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v163, v14, v53, 0x7060302 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v68, v116, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v13 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v15 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v66, 0x40c00000, v69 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v163 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v67, v68, vcc_lo +; GFX11-FAKE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v116 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v69, v66, 16, 1 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-FAKE16-NEXT: v_perm_b32 v162, v13, v65, 0x7060302 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_bfe_u32 v117, v67, 16, 1 +; GFX11-FAKE16-NEXT: v_add3_u32 v68, v69, v66, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v69, 0x400000, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v67 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-FAKE16-NEXT: v_perm_b32 v163, v12, v10, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v112, v113 :: v_dual_add_f32 v112, 0x40c00000, v115 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v114, v67, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v67 -; GFX11-FAKE16-NEXT: v_bfe_u32 v115, v16, 16, 1 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-FAKE16-NEXT: v_add3_u32 v117, v117, v67, 0x7fff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v68, v69, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v65 ; GFX11-FAKE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v112, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v112 -; GFX11-FAKE16-NEXT: v_perm_b32 v148, v13, v66, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v113, v114, vcc_lo -; GFX11-FAKE16-NEXT: v_add3_u32 v114, v115, v16, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[180:181] +; GFX11-FAKE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 16, v162 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 8, v162 +; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v15, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v119, 0x400000, v15 +; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v16, 16, 1 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 -; GFX11-FAKE16-NEXT: v_bfe_u32 v113, v15, 16, 1 -; GFX11-FAKE16-NEXT: v_add3_u32 v116, v116, v112, 0x7fff -; GFX11-FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v114, v115, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v112, v112 -; GFX11-FAKE16-NEXT: v_add3_u32 v113, v113, v15, 0x7fff -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[96:97] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[86:87] -; GFX11-FAKE16-NEXT: v_perm_b32 v135, v16, v67, 0x7060302 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v112, v116, v117, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 8, v180 +; GFX11-FAKE16-NEXT: v_add3_u32 v68, v68, v15, 0x7fff +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 24, v85 +; GFX11-FAKE16-NEXT: v_add3_u32 v69, v116, v16, 0x7fff +; GFX11-FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v69, v116, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v39 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v117, v118, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v52 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[84:85] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v113, v118, vcc_lo -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX11-FAKE16-NEXT: v_perm_b32 v134, v15, v112, 0x7060302 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v112 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v48 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v51 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[134:135] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[148:149] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[162:163] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[176:177] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v53 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v37 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[182:183] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[82:83] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v67 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v66 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v54 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v38 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[102:103] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[98:99] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[80:81] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v55 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v39 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[100:101] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[70:71] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[68:69] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 24, v135 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v135 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v134 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v134 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 24, v149 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v149 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v148 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v148 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v163 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 8, v163 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 16, v162 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v162 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 24, v177 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v177 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v176 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 8, v176 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 24, v183 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v183 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v182 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v103 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 8, v103 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v102 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v102 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 24, v101 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v101 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v100 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v100 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 24, v99 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 8, v99 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 24, v97 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[84:85] +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v68, v119, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v146, v16, v66, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v66 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[42:43] +; GFX11-FAKE16-NEXT: v_perm_b32 v145, v15, v67, 0x7060302 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v67 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[114:115] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v53 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[145:146] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[162:163] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[166:167] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[112:113] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[102:103] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[82:83] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[80:81] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[70:71] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v146 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 8, v146 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v145 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 8, v145 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 24, v163 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v167 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 8, v166 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 8, v42 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v98 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 24, v97 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v96 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v96 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 24, v87 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 24, v87 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v86 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 16, v86 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 24, v85 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v84 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v83 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v82 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v81 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v81 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 24, v71 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v77 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v70 ; GFX11-FAKE16-NEXT: .LBB90_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v76 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v76 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v75 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v68 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v63 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v75 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v72 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v60 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v68 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v178 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v54 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v166 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v68, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v63 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v62 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v62 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v160 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v69, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v60 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v164 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v59 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v65, v66 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v44 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v68, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v56 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v65, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v147 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v41 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v68, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v45 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v65 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v41 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v180 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v132 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v179 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v183 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v179 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v167 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v181 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v165 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v177 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v164 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v66, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v176 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v162 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v161 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v165 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xff, v118 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v149 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v65, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v54, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v66, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v68, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64 ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v64 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v145 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v151 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v148 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v144 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v116 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v135 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v134 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v145 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v131 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v134 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v129 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v133 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v132 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v74 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v72 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v53 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 @@ -167262,30 +166617,30 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v59 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v71 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v161 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v61 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v47 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v80 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v50 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v46 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v81 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v146 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v43 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v149 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v47 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v40 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v44 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v83 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v144 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v42 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 @@ -167307,29 +166662,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v182 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v128 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v176 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v180 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v82 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v86 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v163 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v167 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v83 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v117 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v151 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v163 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v103 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v96 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v115 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 @@ -167352,31 +166707,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v85 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v115 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v102 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v97 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v114 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v86 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v98 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v113 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v87 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v99 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v116 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v112 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v96 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v100 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v103 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v97 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v112 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v98 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v55 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v102 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 @@ -167404,29 +166759,28 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload -; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:88 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:92 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:96 +; GFX11-FAKE16-NEXT: s_clause 0x14 ; 84-byte Folded Reload +; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:92 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %b, 0 @@ -167451,9 +166805,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill @@ -167468,30 +166822,32 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:72 ; SI-NEXT: v_writelane_b32 v63, s30, 0 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 @@ -167523,507 +166879,512 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v63, s84, 28 ; SI-NEXT: v_writelane_b32 v63, s85, 29 ; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v17 ; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v10 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v23 ; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v25 ; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: v_writelane_b32 v63, s98, 34 ; SI-NEXT: v_writelane_b32 v63, s99, 35 ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v38 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v48 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v33 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v54 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v45 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v29 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v42 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v57, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v58 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v58 ; SI-NEXT: v_mul_f32_e32 v58, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v49 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v60 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v49 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v51 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mul_f32_e32 v49, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v41 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v43 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v44 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v45 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s20 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v53 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v54 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v54, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v41, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v45, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v43, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v44, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB91_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v13 -; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v15 -; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: s_lshr_b32 s7, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_readfirstlane_b32 s82, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v51 +; SI-NEXT: s_lshr_b32 s87, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s86, v32 +; SI-NEXT: s_lshr_b64 s[10:11], s[86:87], 16 +; SI-NEXT: s_mov_b32 s99, s10 +; SI-NEXT: s_lshr_b64 s[4:5], s[98:99], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 4 +; SI-NEXT: v_writelane_b32 v62, s5, 5 +; SI-NEXT: s_lshr_b64 s[4:5], s[98:99], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: v_writelane_b32 v62, s5, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[98:99], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 0 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: v_writelane_b32 v62, s5, 1 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: s_lshr_b64 s[96:97], s[4:5], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: s_lshr_b32 s65, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v41 +; SI-NEXT: s_lshr_b32 s83, s4, 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[82:83], 16 +; SI-NEXT: s_mov_b32 s97, s42 +; SI-NEXT: s_lshr_b64 s[4:5], s[96:97], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: v_writelane_b32 v62, s5, 11 +; SI-NEXT: s_lshr_b64 s[4:5], s[96:97], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_writelane_b32 v62, s5, 9 +; SI-NEXT: s_lshr_b64 s[4:5], s[96:97], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 6 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_writelane_b32 v62, s5, 7 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v42 +; SI-NEXT: v_readfirstlane_b32 s4, v18 ; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v45 +; SI-NEXT: v_readfirstlane_b32 s4, v42 ; SI-NEXT: s_lshr_b32 s69, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v43 +; SI-NEXT: v_readfirstlane_b32 s68, v16 +; SI-NEXT: s_lshr_b64 s[54:55], s[68:69], 16 +; SI-NEXT: s_mov_b32 s81, s54 +; SI-NEXT: s_lshr_b64 s[4:5], s[80:81], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: v_writelane_b32 v62, s5, 17 +; SI-NEXT: s_lshr_b64 s[4:5], s[80:81], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_writelane_b32 v62, s5, 15 +; SI-NEXT: s_lshr_b64 s[4:5], s[80:81], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v44 -; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_mov_b32_e32 v33, v35 ; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: s_lshr_b32 s91, s4, 16 -; SI-NEXT: v_mov_b32_e32 v30, v51 -; SI-NEXT: v_readfirstlane_b32 s4, v47 -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: s_lshr_b32 s19, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s18, v32 +; SI-NEXT: s_lshr_b64 s[38:39], s[18:19], 16 +; SI-NEXT: s_mov_b32 s67, s38 +; SI-NEXT: s_lshr_b64 s[4:5], s[66:67], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 22 +; SI-NEXT: v_writelane_b32 v62, s5, 23 +; SI-NEXT: s_lshr_b64 s[4:5], s[66:67], 16 +; SI-NEXT: v_readfirstlane_b32 s64, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_writelane_b32 v62, s4, 20 +; SI-NEXT: v_mov_b32_e32 v1, v40 +; SI-NEXT: v_mov_b32_e32 v13, v3 +; SI-NEXT: v_mov_b32_e32 v35, v47 +; SI-NEXT: v_mov_b32_e32 v3, v43 +; SI-NEXT: v_mov_b32_e32 v47, v19 +; SI-NEXT: v_mov_b32_e32 v19, v22 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_writelane_b32 v62, s5, 21 +; SI-NEXT: s_lshr_b64 s[4:5], s[66:67], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 18 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: v_writelane_b32 v62, s5, 19 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_readfirstlane_b32 s4, v13 ; SI-NEXT: s_lshr_b64 s[52:53], s[4:5], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: s_lshr_b32 s37, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_lshr_b32 s13, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: s_lshr_b64 s[34:35], s[12:13], 16 +; SI-NEXT: s_mov_b32 s53, s34 +; SI-NEXT: s_lshr_b64 s[4:5], s[52:53], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 28 +; SI-NEXT: v_writelane_b32 v62, s5, 29 +; SI-NEXT: s_lshr_b64 s[4:5], s[52:53], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 26 +; SI-NEXT: v_writelane_b32 v62, s5, 27 +; SI-NEXT: s_lshr_b64 s[4:5], s[52:53], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 24 +; SI-NEXT: v_readfirstlane_b32 s4, v44 +; SI-NEXT: v_writelane_b32 v62, s5, 25 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_mov_b32_e32 v41, v5 -; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: v_mov_b32_e32 v5, v39 -; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v5 -; SI-NEXT: s_lshr_b32 s89, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: s_lshr_b32 s65, s4, 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[64:65], 16 +; SI-NEXT: s_mov_b32 s37, s90 +; SI-NEXT: s_lshr_b64 s[4:5], s[36:37], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 34 +; SI-NEXT: v_writelane_b32 v62, s5, 35 +; SI-NEXT: s_lshr_b64 s[4:5], s[36:37], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 32 +; SI-NEXT: v_writelane_b32 v62, s5, 33 +; SI-NEXT: s_lshr_b64 s[4:5], s[36:37], 8 +; SI-NEXT: v_mov_b32_e32 v44, v41 +; SI-NEXT: v_writelane_b32 v62, s4, 30 +; SI-NEXT: v_readfirstlane_b32 s4, v44 +; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: s_lshr_b32 s71, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s70, v41 +; SI-NEXT: s_lshr_b64 s[76:77], s[70:71], 16 +; SI-NEXT: s_mov_b32 s95, s76 +; SI-NEXT: s_lshr_b64 s[4:5], s[94:95], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 40 +; SI-NEXT: v_writelane_b32 v62, s5, 41 +; SI-NEXT: s_lshr_b64 s[4:5], s[94:95], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 38 +; SI-NEXT: v_writelane_b32 v62, s5, 39 +; SI-NEXT: s_lshr_b64 s[4:5], s[94:95], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 36 ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: s_lshr_b64 s[50:51], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_lshr_b32 s57, s4, 16 -; SI-NEXT: v_mov_b32_e32 v42, v32 -; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_mov_b32_e32 v6, v55 -; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v6 -; SI-NEXT: s_lshr_b32 s79, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v20 -; SI-NEXT: v_mov_b32_e32 v39, v12 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: s_lshr_b32 s85, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s84, v12 +; SI-NEXT: s_lshr_b64 s[92:93], s[84:85], 16 +; SI-NEXT: s_mov_b32 s89, s92 +; SI-NEXT: s_lshr_b64 s[4:5], s[88:89], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 46 +; SI-NEXT: v_writelane_b32 v62, s5, 47 +; SI-NEXT: s_lshr_b64 s[4:5], s[88:89], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 44 +; SI-NEXT: v_writelane_b32 v62, s5, 45 +; SI-NEXT: s_lshr_b64 s[4:5], s[88:89], 8 +; SI-NEXT: v_mov_b32_e32 v2, v20 +; SI-NEXT: v_writelane_b32 v62, s4, 42 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_writelane_b32 v62, s5, 43 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_mov_b32_e32 v9, v8 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v43, v20 -; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_readfirstlane_b32 s4, v47 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v19 ; SI-NEXT: v_mov_b32_e32 v20, v21 -; SI-NEXT: v_readfirstlane_b32 s78, v18 ; SI-NEXT: s_lshr_b32 s73, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v20 -; SI-NEXT: v_mov_b32_e32 v18, v22 +; SI-NEXT: v_readfirstlane_b32 s72, v20 +; SI-NEXT: s_lshr_b64 s[62:63], s[72:73], 16 +; SI-NEXT: s_mov_b32 s75, s62 +; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 52 +; SI-NEXT: v_writelane_b32 v62, s5, 53 +; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 50 +; SI-NEXT: v_writelane_b32 v62, s5, 51 +; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 48 +; SI-NEXT: v_readfirstlane_b32 s4, v24 +; SI-NEXT: v_writelane_b32 v62, s5, 49 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v18 -; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v23 ; SI-NEXT: s_lshr_b32 s59, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v28 -; SI-NEXT: v_mov_b32_e32 v21, v25 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s58, v14 +; SI-NEXT: s_lshr_b64 s[78:79], s[58:59], 16 +; SI-NEXT: s_mov_b32 s61, s78 +; SI-NEXT: s_lshr_b64 s[4:5], s[60:61], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 58 +; SI-NEXT: v_writelane_b32 v62, s5, 59 +; SI-NEXT: s_lshr_b64 s[4:5], s[60:61], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 56 +; SI-NEXT: v_writelane_b32 v62, s5, 57 +; SI-NEXT: s_lshr_b64 s[4:5], s[60:61], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 54 +; SI-NEXT: v_readfirstlane_b32 s4, v26 +; SI-NEXT: v_writelane_b32 v62, s5, 55 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v27 -; SI-NEXT: v_mov_b32_e32 v25, v26 +; SI-NEXT: v_readfirstlane_b32 s4, v28 ; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v25 -; SI-NEXT: v_mov_b32_e32 v12, v29 ; SI-NEXT: s_lshr_b32 s45, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v12 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: v_readfirstlane_b32 s44, v30 +; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16 +; SI-NEXT: s_mov_b32 s47, s56 +; SI-NEXT: s_lshr_b64 s[4:5], s[46:47], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 0 +; SI-NEXT: v_writelane_b32 v61, s5, 1 +; SI-NEXT: s_lshr_b64 s[4:5], s[46:47], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 62 +; SI-NEXT: v_writelane_b32 v62, s5, 63 +; SI-NEXT: s_lshr_b64 s[4:5], s[46:47], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 60 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_writelane_b32 v62, s5, 61 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_mov_b32_e32 v1, v52 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v52, v17 -; SI-NEXT: v_readfirstlane_b32 s4, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v22 ; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v29 ; SI-NEXT: s_lshr_b32 s29, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v40 -; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 16 -; SI-NEXT: s_mov_b32 s9, s96 -; SI-NEXT: v_readfirstlane_b32 s88, v60 -; SI-NEXT: s_lshr_b64 s[82:83], s[88:89], 16 -; SI-NEXT: v_readfirstlane_b32 s64, v16 -; SI-NEXT: s_lshr_b64 s[84:85], s[64:65], 16 -; SI-NEXT: s_mov_b32 s87, s84 -; SI-NEXT: v_readfirstlane_b32 s68, v48 -; SI-NEXT: s_lshr_b64 s[70:71], s[68:69], 16 -; SI-NEXT: s_mov_b32 s81, s70 -; SI-NEXT: v_readfirstlane_b32 s90, v30 -; SI-NEXT: s_lshr_b64 s[38:39], s[90:91], 16 -; SI-NEXT: s_mov_b32 s67, s38 -; SI-NEXT: v_readfirstlane_b32 s36, v3 -; SI-NEXT: s_lshr_b64 s[98:99], s[36:37], 16 -; SI-NEXT: s_mov_b32 s53, s98 -; SI-NEXT: s_mov_b32 s31, s82 -; SI-NEXT: v_readfirstlane_b32 s56, v7 -; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 -; SI-NEXT: s_mov_b32 s51, s94 -; SI-NEXT: s_lshr_b64 s[74:75], s[78:79], 16 -; SI-NEXT: s_mov_b32 s93, s74 -; SI-NEXT: v_readfirstlane_b32 s72, v19 -; SI-NEXT: s_lshr_b64 s[60:61], s[72:73], 16 -; SI-NEXT: s_mov_b32 s77, s60 -; SI-NEXT: v_readfirstlane_b32 s58, v21 -; SI-NEXT: s_lshr_b64 s[54:55], s[58:59], 16 -; SI-NEXT: s_mov_b32 s63, s54 -; SI-NEXT: v_readfirstlane_b32 s44, v22 -; SI-NEXT: s_lshr_b64 s[42:43], s[44:45], 16 -; SI-NEXT: s_mov_b32 s47, s42 -; SI-NEXT: v_mov_b32_e32 v26, v37 -; SI-NEXT: v_readfirstlane_b32 s28, v26 -; SI-NEXT: s_lshr_b64 s[26:27], s[28:29], 16 -; SI-NEXT: s_mov_b32 s41, s26 -; SI-NEXT: v_readfirstlane_b32 s22, v36 -; SI-NEXT: v_readfirstlane_b32 s18, v49 -; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 -; SI-NEXT: v_mov_b32_e32 v1, v56 -; SI-NEXT: v_mov_b32_e32 v3, v54 -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v50 -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v54, v59 -; SI-NEXT: s_lshr_b32 s78, s96, 8 -; SI-NEXT: s_lshr_b32 s61, s84, 8 -; SI-NEXT: s_lshr_b32 s72, s70, 8 -; SI-NEXT: s_lshr_b32 s75, s38, 8 -; SI-NEXT: s_lshr_b32 s58, s98, 8 -; SI-NEXT: s_lshr_b32 s43, s82, 8 -; SI-NEXT: s_lshr_b32 s44, s94, 8 -; SI-NEXT: s_mov_b32 s64, s74 -; SI-NEXT: s_lshr_b32 s27, s74, 8 -; SI-NEXT: s_mov_b32 s90, s60 -; SI-NEXT: s_lshr_b32 s28, s60, 8 -; SI-NEXT: s_lshr_b32 s74, s54, 8 -; SI-NEXT: s_mov_b32 s68, s42 -; SI-NEXT: s_mov_b32 s56, s26 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v3 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v45 -; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v34 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v35 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v9 -; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v25 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_readfirstlane_b32 s28, v27 +; SI-NEXT: s_lshr_b64 s[50:51], s[28:29], 16 +; SI-NEXT: s_mov_b32 s41, s50 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: v_writelane_b32 v61, s5, 7 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: v_writelane_b32 v61, s5, 5 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 2 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_writelane_b32 v61, s5, 3 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s4, v33 -; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v55 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s24, v48 +; SI-NEXT: s_lshr_b64 s[30:31], s[24:25], 16 +; SI-NEXT: s_mov_b32 s23, s30 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: v_writelane_b32 v61, s5, 13 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: v_writelane_b32 v61, s4, 10 +; SI-NEXT: v_writelane_b32 v61, s5, 11 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: v_readfirstlane_b32 s4, v40 +; SI-NEXT: v_writelane_b32 v61, s5, 9 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v43 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v57 -; SI-NEXT: s_lshr_b32 s23, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v45 +; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: v_writelane_b32 v61, s4, 20 +; SI-NEXT: s_mov_b32 s17, s14 +; SI-NEXT: v_writelane_b32 v61, s5, 21 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 18 +; SI-NEXT: v_writelane_b32 v61, s5, 19 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: v_writelane_b32 v61, s4, 16 +; SI-NEXT: v_writelane_b32 v61, s5, 17 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 14 ; SI-NEXT: v_readfirstlane_b32 s4, v58 +; SI-NEXT: v_writelane_b32 v61, s5, 15 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v59 -; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: s_lshr_b32 s19, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s4, v24 -; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: s_lshr_b32 s13, s4, 16 -; SI-NEXT: s_mov_b32 s5, s13 -; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v46 +; SI-NEXT: s_lshr_b32 s49, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s48, v50 +; SI-NEXT: s_lshr_b64 s[20:21], s[48:49], 16 +; SI-NEXT: s_mov_b32 s9, s20 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 26 ; SI-NEXT: v_writelane_b32 v61, s5, 27 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v61, s4, 24 +; SI-NEXT: v_writelane_b32 v61, s5, 25 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 22 +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: v_writelane_b32 v61, s5, 23 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v10 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s5, v56 -; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 -; SI-NEXT: s_lshr_b32 s13, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s12, v14 -; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 -; SI-NEXT: s_mov_b32 s5, vcc_lo -; SI-NEXT: s_mov_b32 s88, vcc_lo -; SI-NEXT: s_lshr_b32 s6, vcc_lo, 8 -; SI-NEXT: s_lshr_b64 vcc, s[8:9], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 4 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 5 -; SI-NEXT: s_lshr_b64 vcc, s[8:9], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 2 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 3 -; SI-NEXT: s_lshr_b64 vcc, s[8:9], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 0 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 1 -; SI-NEXT: s_lshr_b64 vcc, s[86:87], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 10 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 11 -; SI-NEXT: s_lshr_b64 vcc, s[86:87], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 8 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 9 -; SI-NEXT: s_lshr_b64 vcc, s[86:87], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 6 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 7 -; SI-NEXT: s_lshr_b64 vcc, s[80:81], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 16 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 17 -; SI-NEXT: s_lshr_b64 vcc, s[80:81], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 14 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 15 -; SI-NEXT: s_lshr_b64 vcc, s[80:81], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 12 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 13 -; SI-NEXT: s_lshr_b64 vcc, s[66:67], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 22 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 23 -; SI-NEXT: s_lshr_b64 vcc, s[66:67], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 20 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 21 -; SI-NEXT: s_lshr_b64 vcc, s[66:67], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 18 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 19 -; SI-NEXT: s_lshr_b64 vcc, s[52:53], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 28 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 29 -; SI-NEXT: s_lshr_b64 vcc, s[52:53], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 26 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 27 -; SI-NEXT: s_lshr_b64 vcc, s[52:53], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 24 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 25 -; SI-NEXT: s_lshr_b64 vcc, s[30:31], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 34 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 35 -; SI-NEXT: s_lshr_b64 vcc, s[30:31], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 32 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 33 -; SI-NEXT: s_lshr_b64 vcc, s[30:31], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 30 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 31 -; SI-NEXT: s_lshr_b64 vcc, s[50:51], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 40 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 41 -; SI-NEXT: s_lshr_b64 vcc, s[50:51], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 38 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 39 -; SI-NEXT: s_lshr_b64 vcc, s[50:51], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 36 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 37 -; SI-NEXT: s_lshr_b64 vcc, s[92:93], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 -; SI-NEXT: s_lshr_b64 vcc, s[92:93], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 44 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 45 -; SI-NEXT: s_lshr_b64 vcc, s[92:93], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 42 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 43 -; SI-NEXT: s_lshr_b64 vcc, s[76:77], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 52 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 53 -; SI-NEXT: s_lshr_b64 vcc, s[76:77], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 50 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 51 -; SI-NEXT: s_lshr_b64 vcc, s[76:77], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 -; SI-NEXT: s_lshr_b64 vcc, s[62:63], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 58 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 59 -; SI-NEXT: s_lshr_b64 vcc, s[62:63], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 56 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 57 -; SI-NEXT: s_lshr_b64 vcc, s[62:63], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 54 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 55 -; SI-NEXT: s_lshr_b64 vcc, s[46:47], 24 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 0 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 1 -; SI-NEXT: s_lshr_b64 vcc, s[46:47], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 62 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 63 -; SI-NEXT: s_lshr_b64 vcc, s[46:47], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 60 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 61 -; SI-NEXT: s_lshr_b64 vcc, s[40:41], 24 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 6 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 7 -; SI-NEXT: s_lshr_b64 vcc, s[40:41], 16 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 4 -; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 5 -; SI-NEXT: s_lshr_b64 vcc, s[40:41], 8 -; SI-NEXT: s_mov_b32 s25, s34 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 2 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 3 -; SI-NEXT: s_lshr_b64 vcc, s[24:25], 24 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 12 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 13 -; SI-NEXT: s_lshr_b64 vcc, s[24:25], 16 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 10 -; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 11 -; SI-NEXT: s_lshr_b64 vcc, s[24:25], 8 -; SI-NEXT: s_mov_b32 s17, s14 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 8 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 9 -; SI-NEXT: s_lshr_b64 vcc, s[16:17], 24 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 18 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 19 -; SI-NEXT: s_lshr_b64 vcc, s[16:17], 16 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 16 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 17 -; SI-NEXT: s_lshr_b64 vcc, s[16:17], 8 -; SI-NEXT: s_mov_b32 s11, s20 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 14 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 15 -; SI-NEXT: s_lshr_b64 vcc, s[10:11], 24 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 24 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 25 -; SI-NEXT: s_lshr_b64 vcc, s[10:11], 16 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 22 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 23 -; SI-NEXT: s_lshr_b64 vcc, s[10:11], 8 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 20 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 21 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 32 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 33 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 16 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 30 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 31 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 -; SI-NEXT: v_writelane_b32 v61, vcc_lo, 28 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v23 -; SI-NEXT: s_lshr_b32 s22, s42, 8 -; SI-NEXT: s_lshr_b32 s21, s26, 8 -; SI-NEXT: s_lshr_b32 s18, s34, 8 -; SI-NEXT: s_mov_b32 s36, s14 -; SI-NEXT: s_lshr_b32 s15, s14, 8 +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v56 +; SI-NEXT: s_lshr_b32 s27, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s26, v10 +; SI-NEXT: s_lshr_b64 vcc, s[26:27], 16 +; SI-NEXT: s_mov_b32 s5, s27 +; SI-NEXT: v_writelane_b32 v61, s4, 34 +; SI-NEXT: s_mov_b32 s7, vcc_lo +; SI-NEXT: v_writelane_b32 v61, s5, 35 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 32 +; SI-NEXT: v_writelane_b32 v61, s5, 33 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v61, s4, 30 +; SI-NEXT: v_mov_b32_e32 v4, v51 +; SI-NEXT: v_writelane_b32 v61, s5, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 8 +; SI-NEXT: v_mov_b32_e32 v5, v42 +; SI-NEXT: v_mov_b32_e32 v52, v11 +; SI-NEXT: v_mov_b32_e32 v53, v49 +; SI-NEXT: v_writelane_b32 v61, s4, 28 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v25 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v26, v15 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_mov_b32_e32 v60, v7 +; SI-NEXT: v_mov_b32_e32 v51, v45 +; SI-NEXT: v_writelane_b32 v61, s5, 29 +; SI-NEXT: s_lshr_b32 s44, s10, 8 +; SI-NEXT: s_lshr_b32 s43, s42, 8 +; SI-NEXT: s_lshr_b32 s41, s54, 8 +; SI-NEXT: s_lshr_b32 s28, s38, 8 +; SI-NEXT: s_lshr_b32 s61, s34, 8 +; SI-NEXT: s_lshr_b32 s24, s90, 8 +; SI-NEXT: s_lshr_b32 s23, s76, 8 +; SI-NEXT: s_lshr_b32 s21, s92, 8 +; SI-NEXT: s_lshr_b32 s18, s62, 8 +; SI-NEXT: s_lshr_b32 s17, s78, 8 +; SI-NEXT: s_mov_b32 s64, s56 +; SI-NEXT: s_lshr_b32 s15, s56, 8 +; SI-NEXT: s_lshr_b32 s12, s50, 8 +; SI-NEXT: s_lshr_b32 s11, s30, 8 +; SI-NEXT: s_mov_b32 s48, s14 +; SI-NEXT: s_lshr_b32 s9, s14, 8 ; SI-NEXT: s_mov_b32 s14, s20 -; SI-NEXT: s_lshr_b32 s12, s20, 8 -; SI-NEXT: v_writelane_b32 v61, vcc_hi, 29 +; SI-NEXT: s_lshr_b32 s7, s20, 8 +; SI-NEXT: s_mov_b32 s72, vcc_lo +; SI-NEXT: s_lshr_b32 s4, vcc_lo, 8 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v29 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v57 -; SI-NEXT: v_mov_b32_e32 v59, v30 -; SI-NEXT: v_mov_b32_e32 v31, v51 -; SI-NEXT: v_mov_b32_e32 v60, v34 -; SI-NEXT: v_mov_b32_e32 v30, v39 -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v39, v21 -; SI-NEXT: v_mov_b32_e32 v21, v20 -; SI-NEXT: v_mov_b32_e32 v34, v18 -; SI-NEXT: v_mov_b32_e32 v18, v37 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v7, v26 +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v54 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v33 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v29 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v38 +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v57 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v46 +; SI-NEXT: v_mov_b32_e32 v10, v56 +; SI-NEXT: v_mov_b32_e32 v5, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v56 +; SI-NEXT: v_mov_b32_e32 v39, v32 +; SI-NEXT: v_mov_b32_e32 v55, v1 +; SI-NEXT: v_mov_b32_e32 v45, v13 +; SI-NEXT: v_mov_b32_e32 v13, v49 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: v_mov_b32_e32 v32, v44 +; SI-NEXT: v_mov_b32_e32 v44, v35 +; SI-NEXT: v_mov_b32_e32 v56, v12 +; SI-NEXT: v_mov_b32_e32 v12, v11 +; SI-NEXT: v_mov_b32_e32 v11, v31 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: v_mov_b32_e32 v31, v20 ; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v37, v17 -; SI-NEXT: v_mov_b32_e32 v51, v33 -; SI-NEXT: v_mov_b32_e32 v17, v9 -; SI-NEXT: v_mov_b32_e32 v9, v10 -; SI-NEXT: v_mov_b32_e32 v26, v25 +; SI-NEXT: v_mov_b32_e32 v3, v47 +; SI-NEXT: v_mov_b32_e32 v41, v19 ; SI-NEXT: s_branch .LBB91_3 ; SI-NEXT: .LBB91_2: ; SI-NEXT: ; implicit-def: $sgpr4 @@ -168049,21 +167410,27 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_mov_b32_e32 v52, v11 ; SI-NEXT: v_writelane_b32 v62, s5, 15 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v54, v59 +; SI-NEXT: v_mov_b32_e32 v11, v31 +; SI-NEXT: v_mov_b32_e32 v31, v21 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v62, s4, 16 -; SI-NEXT: v_mov_b32_e32 v59, v51 -; SI-NEXT: v_mov_b32_e32 v31, v46 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v7, v37 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v40 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v32, v41 +; SI-NEXT: v_mov_b32_e32 v41, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v10, v56 ; SI-NEXT: v_writelane_b32 v62, s5, 17 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 @@ -168162,8 +167529,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s4, 18 ; SI-NEXT: v_writelane_b32 v61, s5, 19 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s4, 20 +; SI-NEXT: v_writelane_b32 v62, s5, 63 +; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: v_writelane_b32 v61, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s4, 22 @@ -168171,1069 +167539,977 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: v_writelane_b32 v61, s5, 25 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s4, 26 -; SI-NEXT: v_writelane_b32 v62, s5, 63 -; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: v_writelane_b32 v61, s5, 27 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: v_writelane_b32 v61, s20, 28 -; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: v_writelane_b32 v61, s21, 29 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: v_writelane_b32 v61, s20, 30 -; SI-NEXT: v_writelane_b32 v61, s21, 31 -; SI-NEXT: v_mov_b32_e32 v44, v1 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: v_writelane_b32 v61, s88, 32 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v52, v17 -; SI-NEXT: v_mov_b32_e32 v43, v20 -; SI-NEXT: v_mov_b32_e32 v42, v32 -; SI-NEXT: v_mov_b32_e32 v41, v5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 28 +; SI-NEXT: v_writelane_b32 v61, s5, 29 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 30 +; SI-NEXT: v_writelane_b32 v61, s5, 31 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 32 +; SI-NEXT: v_writelane_b32 v61, s5, 33 +; SI-NEXT: v_mov_b32_e32 v26, v15 +; SI-NEXT: v_mov_b32_e32 v53, v49 +; SI-NEXT: v_mov_b32_e32 v51, v45 +; SI-NEXT: v_mov_b32_e32 v60, v7 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s4, 34 ; SI-NEXT: s_mov_b64 vcc, -1 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: v_writelane_b32 v61, s89, 33 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v35, v6 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v30, v12 -; SI-NEXT: v_mov_b32_e32 v19, v39 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v39, v25 -; SI-NEXT: v_mov_b32_e32 v12, v29 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v6, v55 -; SI-NEXT: v_mov_b32_e32 v17, v8 -; SI-NEXT: v_mov_b32_e32 v29, v33 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: v_mov_b32_e32 v5, v8 +; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: v_mov_b32_e32 v45, v3 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v35, v26 +; SI-NEXT: v_mov_b32_e32 v3, v19 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr87 ; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr83 ; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; implicit-def: $sgpr59 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: v_writelane_b32 v61, s5, 35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: .LBB91_3: ; %Flow -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: v_mov_b32_e32 v2, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, vcc ; SI-NEXT: s_cbranch_vccnz .LBB91_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s4, v11 -; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s12, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_readfirstlane_b32 s6, v9 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_readfirstlane_b32 s5, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v40 +; SI-NEXT: v_writelane_b32 v61, s4, 34 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16 -; SI-NEXT: v_readfirstlane_b32 s6, v8 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 -; SI-NEXT: s_lshr_b64 s[20:21], s[8:9], 16 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 -; SI-NEXT: v_writelane_b32 v61, s6, 26 -; SI-NEXT: v_readfirstlane_b32 s6, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 -; SI-NEXT: s_mov_b32 s7, s9 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_readfirstlane_b32 s18, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37 -; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 -; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_lshr_b32 s19, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v21 +; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_writelane_b32 v61, s7, 27 -; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 -; SI-NEXT: s_mov_b32 s17, s26 -; SI-NEXT: s_mov_b32 s11, s20 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_readfirstlane_b32 s10, v4 ; SI-NEXT: v_readfirstlane_b32 s8, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 -; SI-NEXT: v_readfirstlane_b32 s22, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; SI-NEXT: v_readfirstlane_b32 s6, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_lshr_b32 s23, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v5 -; SI-NEXT: v_readfirstlane_b32 s28, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v27 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v4 +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: v_readfirstlane_b32 s48, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v43 +; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: s_lshr_b32 s49, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: s_lshr_b32 s11, s4, 16 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_lshr_b32 s29, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v7 -; SI-NEXT: v_readfirstlane_b32 s44, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v61, s5, 35 +; SI-NEXT: v_writelane_b32 v61, s6, 20 +; SI-NEXT: v_writelane_b32 v61, s7, 21 +; SI-NEXT: s_mov_b32 s17, s56 +; SI-NEXT: s_lshr_b32 s9, s9, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[48:49], 16 +; SI-NEXT: s_mov_b32 s9, s20 +; SI-NEXT: s_mov_b32 s7, vcc_lo +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_mov_b32 s48, s56 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38 +; SI-NEXT: v_readfirstlane_b32 s24, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39 -; SI-NEXT: v_readfirstlane_b32 s6, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_lshr_b32 s45, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v9 -; SI-NEXT: v_readfirstlane_b32 s58, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_lshr_b64 s[62:63], s[8:9], 16 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_readfirstlane_b32 s10, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v27 +; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v29 +; SI-NEXT: v_readfirstlane_b32 s28, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v28 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_readfirstlane_b32 s6, v9 -; SI-NEXT: s_lshr_b32 s59, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v10 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v42 -; SI-NEXT: v_readfirstlane_b32 s6, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: s_lshr_b32 s73, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v11 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v18 -; SI-NEXT: s_lshr_b32 s79, s6, 16 -; SI-NEXT: s_lshr_b64 s[54:55], s[58:59], 16 -; SI-NEXT: s_mov_b32 s63, s54 -; SI-NEXT: s_lshr_b64 s[60:61], s[44:45], 16 -; SI-NEXT: s_mov_b32 s47, s60 -; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 16 -; SI-NEXT: s_mov_b32 s41, s42 -; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 16 -; SI-NEXT: s_mov_b32 s25, s34 -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: s_lshr_b32 s13, s5, 16 -; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 -; SI-NEXT: s_mov_b32 s5, vcc_lo -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v8 -; SI-NEXT: s_lshr_b32 s22, s60, 8 -; SI-NEXT: s_lshr_b32 s21, s42, 8 -; SI-NEXT: s_lshr_b32 s18, s34, 8 -; SI-NEXT: s_lshr_b32 s12, s20, 8 -; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v18 -; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s72, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16 -; SI-NEXT: s_mov_b32 s77, s74 -; SI-NEXT: s_lshr_b32 s28, s74, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s78, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_lshr_b64 s[48:49], s[78:79], 16 -; SI-NEXT: s_mov_b32 s93, s48 -; SI-NEXT: s_lshr_b32 s27, s48, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s6, v11 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s56, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: s_lshr_b32 s57, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v13 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19 -; SI-NEXT: s_lshr_b64 s[30:31], s[8:9], 16 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s6, v24 -; SI-NEXT: s_lshr_b32 s89, s6, 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 -; SI-NEXT: s_mov_b32 s51, s94 -; SI-NEXT: s_lshr_b32 s44, s94, 8 -; SI-NEXT: s_mov_b32 s56, s42 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s88, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_lshr_b64 s[82:83], s[88:89], 16 -; SI-NEXT: s_mov_b32 s31, s82 -; SI-NEXT: s_lshr_b32 s43, s82, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_readfirstlane_b32 s6, v13 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 16 -; SI-NEXT: v_readfirstlane_b32 s6, v4 -; SI-NEXT: s_lshr_b32 s37, s6, 16 -; SI-NEXT: s_mov_b32 s88, vcc_lo -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s36, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v59 -; SI-NEXT: s_lshr_b64 s[98:99], s[36:37], 16 -; SI-NEXT: s_mov_b32 s53, s98 -; SI-NEXT: s_lshr_b32 s58, s98, 8 -; SI-NEXT: s_mov_b32 s36, s26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[40:41], s[10:11], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_readfirstlane_b32 s10, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v42 +; SI-NEXT: s_lshr_b32 s29, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v25 +; SI-NEXT: v_readfirstlane_b32 s44, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: v_readfirstlane_b32 s10, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v14 +; SI-NEXT: s_lshr_b32 s45, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_readfirstlane_b32 s58, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_readfirstlane_b32 s6, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11 -; SI-NEXT: v_readfirstlane_b32 s90, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: s_lshr_b64 s[66:67], s[8:9], 16 -; SI-NEXT: v_readfirstlane_b32 s6, v11 -; SI-NEXT: s_lshr_b32 s91, s6, 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[90:91], 16 -; SI-NEXT: s_mov_b32 s67, s38 -; SI-NEXT: s_lshr_b32 s75, s38, 8 -; SI-NEXT: s_mov_b32 s90, s74 -; SI-NEXT: s_lshr_b32 s74, s54, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v31 +; SI-NEXT: s_lshr_b32 s59, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v41 +; SI-NEXT: v_readfirstlane_b32 s72, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v35 +; SI-NEXT: s_lshr_b32 s11, s4, 16 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_readfirstlane_b32 s8, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_readfirstlane_b32 s6, v15 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: s_lshr_b64 s[80:81], s[8:9], 16 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v56 +; SI-NEXT: s_lshr_b32 s73, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: s_lshr_b32 s85, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_readfirstlane_b32 s84, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[88:89], s[10:11], 16 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v21 +; SI-NEXT: s_lshr_b32 s71, s4, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: s_lshr_b64 s[92:93], s[84:85], 16 +; SI-NEXT: s_mov_b32 s89, s92 +; SI-NEXT: s_lshr_b64 s[62:63], s[72:73], 16 +; SI-NEXT: s_mov_b32 s75, s62 +; SI-NEXT: s_lshr_b64 s[78:79], s[58:59], 16 +; SI-NEXT: s_mov_b32 s61, s78 +; SI-NEXT: s_lshr_b64 s[26:27], s[44:45], 16 +; SI-NEXT: s_mov_b32 s47, s26 +; SI-NEXT: s_lshr_b64 s[50:51], s[28:29], 16 +; SI-NEXT: s_mov_b32 s41, s50 +; SI-NEXT: s_lshr_b64 s[30:31], s[24:25], 16 +; SI-NEXT: s_mov_b32 s23, s30 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_lshr_b32 s21, s92, 8 +; SI-NEXT: s_mov_b32 s72, vcc_lo +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s10, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s70, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: s_lshr_b64 s[94:95], s[10:11], 16 +; SI-NEXT: v_readfirstlane_b32 s10, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 -; SI-NEXT: v_readfirstlane_b32 s68, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_readfirstlane_b32 s6, v13 -; SI-NEXT: s_lshr_b32 s69, s6, 16 -; SI-NEXT: s_lshr_b64 s[70:71], s[68:69], 16 -; SI-NEXT: s_mov_b32 s81, s70 -; SI-NEXT: s_lshr_b32 s72, s70, 8 -; SI-NEXT: s_mov_b32 s68, s60 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_readfirstlane_b32 s64, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v45 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: s_lshr_b64 s[36:37], s[10:11], 16 +; SI-NEXT: v_readfirstlane_b32 s10, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: s_lshr_b64 s[76:77], s[70:71], 16 +; SI-NEXT: s_mov_b32 s95, s76 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_readfirstlane_b32 s8, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: s_lshr_b32 s65, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[10:11], 16 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_lshr_b32 s13, s4, 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[64:65], 16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s12, v16 +; SI-NEXT: s_lshr_b64 s[34:35], s[12:13], 16 +; SI-NEXT: s_mov_b32 s53, s34 +; SI-NEXT: s_mov_b32 s37, s90 +; SI-NEXT: s_lshr_b32 s24, s90, 8 +; SI-NEXT: s_mov_b32 s64, s26 +; SI-NEXT: s_lshr_b32 s12, s50, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s10, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s18, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[10:11], 16 +; SI-NEXT: v_readfirstlane_b32 s10, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: s_lshr_b32 s19, s4, 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[18:19], 16 +; SI-NEXT: s_mov_b32 s67, s38 +; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 24 +; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v9 +; SI-NEXT: s_lshr_b32 s28, s38, 8 +; SI-NEXT: s_lshr_b32 s18, s62, 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_readfirstlane_b32 s64, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_readfirstlane_b32 s6, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: s_lshr_b64 s[86:87], s[8:9], 16 -; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: v_readfirstlane_b32 s68, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: s_lshr_b64 s[80:81], s[10:11], 16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: s_lshr_b32 s65, s6, 16 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: s_lshr_b64 s[84:85], s[64:65], 16 -; SI-NEXT: s_mov_b32 s87, s84 -; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v6 -; SI-NEXT: s_lshr_b32 s61, s84, 8 -; SI-NEXT: s_mov_b32 s64, s48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s10, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_readfirstlane_b32 s8, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: s_lshr_b32 s69, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[68:69], 16 +; SI-NEXT: s_mov_b32 s81, s54 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_readfirstlane_b32 s6, v16 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v6 -; SI-NEXT: s_lshr_b32 s7, s6, 16 -; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v12 +; SI-NEXT: v_readfirstlane_b32 s82, v16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: s_lshr_b32 s83, s4, 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[82:83], 16 +; SI-NEXT: s_mov_b32 s97, s42 +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v11 +; SI-NEXT: s_lshr_b32 s43, s42, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s10, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_readfirstlane_b32 s6, v15 -; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 16 -; SI-NEXT: s_mov_b32 s9, s96 -; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 24 -; SI-NEXT: v_writelane_b32 v62, s14, 4 -; SI-NEXT: v_writelane_b32 v62, s15, 5 -; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 -; SI-NEXT: v_writelane_b32 v62, s14, 2 -; SI-NEXT: v_writelane_b32 v62, s15, 3 -; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 8 -; SI-NEXT: v_writelane_b32 v62, s14, 0 -; SI-NEXT: v_writelane_b32 v62, s15, 1 -; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 24 -; SI-NEXT: v_writelane_b32 v62, s14, 10 -; SI-NEXT: v_writelane_b32 v62, s15, 11 -; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 16 -; SI-NEXT: v_writelane_b32 v62, s14, 8 -; SI-NEXT: v_writelane_b32 v62, s15, 9 -; SI-NEXT: s_lshr_b64 s[14:15], s[86:87], 8 -; SI-NEXT: v_writelane_b32 v62, s14, 6 -; SI-NEXT: v_writelane_b32 v62, s15, 7 -; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 24 -; SI-NEXT: v_writelane_b32 v62, s14, 16 -; SI-NEXT: v_writelane_b32 v62, s15, 17 -; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 16 -; SI-NEXT: v_writelane_b32 v62, s14, 14 -; SI-NEXT: v_writelane_b32 v62, s15, 15 -; SI-NEXT: s_lshr_b64 s[14:15], s[80:81], 8 -; SI-NEXT: v_writelane_b32 v62, s14, 12 -; SI-NEXT: v_writelane_b32 v62, s15, 13 -; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_lshr_b32 s11, s4, 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_readfirstlane_b32 s86, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_lshr_b32 s87, s4, 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[86:87], 16 +; SI-NEXT: s_mov_b32 s99, s10 +; SI-NEXT: s_lshr_b64 s[4:5], s[98:99], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 4 +; SI-NEXT: v_writelane_b32 v62, s5, 5 +; SI-NEXT: s_lshr_b64 s[4:5], s[98:99], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: v_writelane_b32 v62, s5, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[98:99], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 0 +; SI-NEXT: v_writelane_b32 v62, s5, 1 +; SI-NEXT: s_lshr_b64 s[4:5], s[96:97], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: v_writelane_b32 v62, s5, 11 +; SI-NEXT: s_lshr_b64 s[4:5], s[96:97], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 8 +; SI-NEXT: v_writelane_b32 v62, s5, 9 +; SI-NEXT: s_lshr_b64 s[4:5], s[96:97], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 6 +; SI-NEXT: v_writelane_b32 v62, s5, 7 +; SI-NEXT: s_lshr_b64 s[4:5], s[80:81], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: v_writelane_b32 v62, s5, 17 +; SI-NEXT: s_lshr_b64 s[4:5], s[80:81], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_writelane_b32 v62, s5, 15 +; SI-NEXT: s_lshr_b64 s[4:5], s[80:81], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: v_writelane_b32 v62, s14, 22 ; SI-NEXT: v_writelane_b32 v62, s15, 23 -; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 16 -; SI-NEXT: v_writelane_b32 v62, s14, 20 -; SI-NEXT: v_writelane_b32 v62, s15, 21 -; SI-NEXT: s_lshr_b64 s[14:15], s[66:67], 8 -; SI-NEXT: v_writelane_b32 v62, s14, 18 -; SI-NEXT: v_writelane_b32 v62, s15, 19 +; SI-NEXT: s_lshr_b64 s[4:5], s[66:67], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 20 +; SI-NEXT: v_writelane_b32 v62, s5, 21 +; SI-NEXT: s_lshr_b64 s[4:5], s[66:67], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 18 +; SI-NEXT: v_writelane_b32 v62, s5, 19 ; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 24 ; SI-NEXT: v_writelane_b32 v62, s14, 28 ; SI-NEXT: v_writelane_b32 v62, s15, 29 -; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 16 -; SI-NEXT: v_writelane_b32 v62, s14, 26 -; SI-NEXT: v_writelane_b32 v62, s15, 27 -; SI-NEXT: s_lshr_b64 s[14:15], s[52:53], 8 -; SI-NEXT: v_writelane_b32 v62, s14, 24 -; SI-NEXT: v_writelane_b32 v62, s15, 25 -; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 24 -; SI-NEXT: v_writelane_b32 v62, s14, 34 -; SI-NEXT: v_writelane_b32 v62, s15, 35 -; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 16 -; SI-NEXT: v_writelane_b32 v62, s14, 32 -; SI-NEXT: v_writelane_b32 v62, s15, 33 -; SI-NEXT: s_lshr_b64 s[14:15], s[30:31], 8 -; SI-NEXT: v_writelane_b32 v62, s14, 30 -; SI-NEXT: v_writelane_b32 v62, s15, 31 -; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 24 -; SI-NEXT: v_writelane_b32 v62, s14, 40 -; SI-NEXT: v_writelane_b32 v62, s15, 41 -; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 16 -; SI-NEXT: v_writelane_b32 v62, s14, 38 -; SI-NEXT: v_writelane_b32 v62, s15, 39 -; SI-NEXT: s_lshr_b64 s[14:15], s[50:51], 8 -; SI-NEXT: v_writelane_b32 v62, s14, 36 -; SI-NEXT: v_writelane_b32 v62, s15, 37 -; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 24 -; SI-NEXT: v_writelane_b32 v62, s14, 46 -; SI-NEXT: v_writelane_b32 v62, s15, 47 -; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 16 -; SI-NEXT: v_writelane_b32 v62, s14, 44 -; SI-NEXT: v_writelane_b32 v62, s15, 45 -; SI-NEXT: s_lshr_b64 s[14:15], s[92:93], 8 -; SI-NEXT: v_writelane_b32 v62, s14, 42 -; SI-NEXT: v_writelane_b32 v62, s15, 43 -; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 24 -; SI-NEXT: v_writelane_b32 v62, s14, 52 -; SI-NEXT: v_writelane_b32 v62, s15, 53 -; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 16 -; SI-NEXT: v_writelane_b32 v62, s14, 50 -; SI-NEXT: v_writelane_b32 v62, s15, 51 -; SI-NEXT: s_lshr_b64 s[14:15], s[76:77], 8 -; SI-NEXT: v_writelane_b32 v62, s14, 48 -; SI-NEXT: v_writelane_b32 v62, s15, 49 -; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 24 -; SI-NEXT: v_writelane_b32 v62, s14, 58 -; SI-NEXT: v_writelane_b32 v62, s15, 59 -; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 16 -; SI-NEXT: v_writelane_b32 v62, s14, 56 -; SI-NEXT: v_writelane_b32 v62, s15, 57 -; SI-NEXT: s_lshr_b64 s[14:15], s[62:63], 8 -; SI-NEXT: v_writelane_b32 v62, s14, 54 -; SI-NEXT: v_writelane_b32 v62, s15, 55 -; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 24 -; SI-NEXT: v_writelane_b32 v61, s14, 0 -; SI-NEXT: v_writelane_b32 v61, s15, 1 -; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 16 -; SI-NEXT: v_writelane_b32 v62, s14, 62 -; SI-NEXT: v_writelane_b32 v62, s15, 63 -; SI-NEXT: s_lshr_b64 s[14:15], s[46:47], 8 -; SI-NEXT: v_writelane_b32 v62, s14, 60 -; SI-NEXT: v_writelane_b32 v62, s15, 61 -; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v61, s14, 6 -; SI-NEXT: v_writelane_b32 v61, s15, 7 -; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v61, s14, 4 -; SI-NEXT: v_writelane_b32 v61, s15, 5 -; SI-NEXT: s_lshr_b64 s[14:15], s[40:41], 8 -; SI-NEXT: v_writelane_b32 v61, s14, 2 -; SI-NEXT: v_writelane_b32 v61, s15, 3 -; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; SI-NEXT: v_writelane_b32 v61, s14, 12 -; SI-NEXT: v_writelane_b32 v61, s15, 13 -; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 16 -; SI-NEXT: v_writelane_b32 v61, s14, 10 -; SI-NEXT: v_writelane_b32 v61, s15, 11 -; SI-NEXT: s_lshr_b64 s[14:15], s[24:25], 8 -; SI-NEXT: v_writelane_b32 v61, s14, 8 -; SI-NEXT: v_writelane_b32 v61, s15, 9 -; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 24 -; SI-NEXT: v_writelane_b32 v61, s14, 18 -; SI-NEXT: v_writelane_b32 v61, s15, 19 -; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 -; SI-NEXT: v_writelane_b32 v61, s14, 16 -; SI-NEXT: v_writelane_b32 v61, s15, 17 -; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 8 -; SI-NEXT: v_writelane_b32 v61, s14, 14 -; SI-NEXT: v_writelane_b32 v61, s15, 15 -; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v61, s14, 24 -; SI-NEXT: v_writelane_b32 v61, s15, 25 -; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16 -; SI-NEXT: v_writelane_b32 v61, s14, 22 -; SI-NEXT: v_writelane_b32 v61, s15, 23 -; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 8 -; SI-NEXT: v_writelane_b32 v61, s14, 20 -; SI-NEXT: v_writelane_b32 v61, s15, 21 -; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 24 -; SI-NEXT: v_writelane_b32 v61, s14, 32 -; SI-NEXT: v_writelane_b32 v61, s15, 33 -; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 -; SI-NEXT: v_writelane_b32 v61, s14, 30 -; SI-NEXT: v_writelane_b32 v61, s15, 31 -; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 8 -; SI-NEXT: v_writelane_b32 v61, s14, 28 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v5 -; SI-NEXT: v_writelane_b32 v61, s15, 29 -; SI-NEXT: s_lshr_b32 s78, s96, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[52:53], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 26 +; SI-NEXT: v_writelane_b32 v62, s5, 27 +; SI-NEXT: s_lshr_b64 s[4:5], s[52:53], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 24 +; SI-NEXT: v_writelane_b32 v62, s5, 25 +; SI-NEXT: s_lshr_b64 s[4:5], s[36:37], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 34 +; SI-NEXT: v_writelane_b32 v62, s5, 35 +; SI-NEXT: s_lshr_b64 s[4:5], s[36:37], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 32 +; SI-NEXT: v_writelane_b32 v62, s5, 33 +; SI-NEXT: s_lshr_b64 s[4:5], s[36:37], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 30 +; SI-NEXT: v_writelane_b32 v62, s5, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[94:95], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 40 +; SI-NEXT: v_writelane_b32 v62, s5, 41 +; SI-NEXT: s_lshr_b64 s[4:5], s[94:95], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 38 +; SI-NEXT: v_writelane_b32 v62, s5, 39 +; SI-NEXT: s_lshr_b64 s[4:5], s[94:95], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 36 +; SI-NEXT: v_writelane_b32 v62, s5, 37 +; SI-NEXT: s_lshr_b64 s[4:5], s[88:89], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 46 +; SI-NEXT: v_writelane_b32 v62, s5, 47 +; SI-NEXT: s_lshr_b64 s[4:5], s[88:89], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 44 +; SI-NEXT: v_writelane_b32 v62, s5, 45 +; SI-NEXT: s_lshr_b64 s[4:5], s[88:89], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 42 +; SI-NEXT: v_writelane_b32 v62, s5, 43 +; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 52 +; SI-NEXT: v_writelane_b32 v62, s5, 53 +; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 50 +; SI-NEXT: v_writelane_b32 v62, s5, 51 +; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 48 +; SI-NEXT: v_writelane_b32 v62, s5, 49 +; SI-NEXT: s_lshr_b64 s[4:5], s[60:61], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 58 +; SI-NEXT: v_writelane_b32 v62, s5, 59 +; SI-NEXT: s_lshr_b64 s[4:5], s[60:61], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 56 +; SI-NEXT: v_writelane_b32 v62, s5, 57 +; SI-NEXT: s_lshr_b64 s[4:5], s[60:61], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 54 +; SI-NEXT: v_writelane_b32 v62, s5, 55 +; SI-NEXT: s_lshr_b64 s[4:5], s[46:47], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 0 +; SI-NEXT: v_writelane_b32 v61, s5, 1 +; SI-NEXT: s_lshr_b64 s[4:5], s[46:47], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 62 +; SI-NEXT: v_writelane_b32 v62, s5, 63 +; SI-NEXT: s_lshr_b64 s[4:5], s[46:47], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 60 +; SI-NEXT: v_writelane_b32 v62, s5, 61 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: v_writelane_b32 v61, s5, 7 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: v_writelane_b32 v61, s5, 5 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 2 +; SI-NEXT: v_writelane_b32 v61, s5, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: v_writelane_b32 v61, s5, 13 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: v_writelane_b32 v61, s4, 10 +; SI-NEXT: v_writelane_b32 v61, s5, 11 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: v_writelane_b32 v61, s5, 9 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 18 +; SI-NEXT: v_writelane_b32 v61, s5, 19 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: v_writelane_b32 v61, s4, 16 +; SI-NEXT: v_writelane_b32 v61, s5, 17 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 14 +; SI-NEXT: v_writelane_b32 v61, s5, 15 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: v_writelane_b32 v61, s5, 27 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v61, s4, 24 +; SI-NEXT: v_writelane_b32 v61, s5, 25 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 22 +; SI-NEXT: v_writelane_b32 v61, s5, 23 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v61, s4, 32 +; SI-NEXT: v_writelane_b32 v61, s5, 33 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v61, s4, 30 +; SI-NEXT: v_writelane_b32 v61, s5, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v61, s4, 28 +; SI-NEXT: v_writelane_b32 v61, s5, 29 +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v5 +; SI-NEXT: s_lshr_b32 s44, s10, 8 +; SI-NEXT: s_lshr_b32 s41, s54, 8 +; SI-NEXT: s_lshr_b32 s61, s34, 8 +; SI-NEXT: s_lshr_b32 s23, s76, 8 +; SI-NEXT: s_lshr_b32 s17, s78, 8 ; SI-NEXT: s_lshr_b32 s15, s26, 8 +; SI-NEXT: s_lshr_b32 s11, s30, 8 +; SI-NEXT: s_lshr_b32 s9, s56, 8 ; SI-NEXT: s_mov_b32 s14, s20 -; SI-NEXT: s_lshr_b32 s6, vcc_lo, 8 -; SI-NEXT: v_mov_b32_e32 v14, v4 -; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_lshr_b32 s7, s20, 8 +; SI-NEXT: s_lshr_b32 s4, vcc_lo, 8 ; SI-NEXT: .LBB91_5: ; %end -; SI-NEXT: s_and_b32 s5, s8, 0xff -; SI-NEXT: v_readlane_b32 s8, v62, 0 -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v62, 1 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v62, 2 -; SI-NEXT: v_readlane_b32 s9, v62, 3 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, vcc_lo, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: s_and_b32 s5, s96, 0xff -; SI-NEXT: s_lshl_b32 s8, s78, 8 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s7, 0xff +; SI-NEXT: v_readlane_b32 s26, v62, 0 +; SI-NEXT: s_and_b32 s47, s98, 0xff +; SI-NEXT: s_lshl_b32 s57, s26, 8 +; SI-NEXT: v_readlane_b32 s26, v62, 2 +; SI-NEXT: s_or_b32 s47, s47, s57 +; SI-NEXT: s_and_b32 s57, s26, 0xff +; SI-NEXT: v_readlane_b32 s26, v62, 4 +; SI-NEXT: s_lshl_b32 s57, s57, 16 +; SI-NEXT: s_lshl_b32 s58, s26, 24 +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_or_b32 s57, s58, s57 +; SI-NEXT: s_or_b32 s47, s47, s57 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s44, s44, 8 +; SI-NEXT: v_mov_b32_e32 v1, s47 +; SI-NEXT: s_or_b32 s10, s10, s44 +; SI-NEXT: s_and_b32 s44, s87, 0xff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s44, s44, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v48 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 6 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s86, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s44, v1 +; SI-NEXT: v_readlane_b32 s26, v62, 6 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s96, 0xff +; SI-NEXT: s_lshl_b32 s44, s26, 8 +; SI-NEXT: v_readlane_b32 s26, v62, 8 +; SI-NEXT: s_or_b32 s10, s10, s44 +; SI-NEXT: s_and_b32 s44, s26, 0xff +; SI-NEXT: v_readlane_b32 s26, v62, 10 +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_lshl_b32 s47, s26, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s44, s47, s44 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: v_readlane_b32 s9, v62, 7 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v62, 8 -; SI-NEXT: v_readlane_b32 s9, v62, 9 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 10 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, vcc_lo, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s10, s10, s44 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11 -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_and_b32 s5, s84, 0xff -; SI-NEXT: s_lshl_b32 s8, s61, 8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s42, 0xff +; SI-NEXT: s_lshl_b32 s43, s43, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s65, 0xff +; SI-NEXT: s_or_b32 s10, s10, s43 +; SI-NEXT: s_and_b32 s43, s83, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 12 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s80, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v62, 13 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v62, 14 -; SI-NEXT: v_readlane_b32 s9, v62, 15 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s60, v62, 16 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s60, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_lshl_b32 s43, s43, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v18 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s43, v1 +; SI-NEXT: v_readlane_b32 s26, v62, 12 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s80, 0xff +; SI-NEXT: s_lshl_b32 s43, s26, 8 +; SI-NEXT: v_readlane_b32 s26, v62, 14 +; SI-NEXT: s_or_b32 s10, s10, s43 +; SI-NEXT: s_and_b32 s43, s26, 0xff +; SI-NEXT: v_readlane_b32 s26, v62, 16 +; SI-NEXT: s_lshl_b32 s43, s43, 16 +; SI-NEXT: s_lshl_b32 s44, s26, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s43, s44, s43 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s10, s10, s43 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_and_b32 s5, s70, 0xff -; SI-NEXT: s_lshl_b32 s8, s72, 8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s54, 0xff +; SI-NEXT: s_lshl_b32 s41, s41, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s69, 0xff +; SI-NEXT: s_or_b32 s10, s10, s41 +; SI-NEXT: s_and_b32 s41, s69, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 18 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s66, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v62, 19 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v62, 20 -; SI-NEXT: v_readlane_b32 s9, v62, 21 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s60, v62, 22 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s60, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_lshl_b32 s41, s41, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s41, v1 +; SI-NEXT: v_readlane_b32 s26, v62, 18 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s66, 0xff +; SI-NEXT: s_lshl_b32 s41, s26, 8 +; SI-NEXT: v_readlane_b32 s26, v62, 20 +; SI-NEXT: v_readlane_b32 s27, v62, 1 +; SI-NEXT: s_or_b32 s10, s10, s41 +; SI-NEXT: s_and_b32 s41, s26, 0xff +; SI-NEXT: v_readlane_b32 s42, v62, 22 +; SI-NEXT: v_readlane_b32 s43, v62, 23 +; SI-NEXT: v_readlane_b32 s27, v62, 3 +; SI-NEXT: s_lshl_b32 s41, s41, 16 +; SI-NEXT: s_lshl_b32 s43, s42, 24 +; SI-NEXT: v_readlane_b32 s27, v62, 5 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s41, s43, s41 +; SI-NEXT: v_readlane_b32 s27, v62, 7 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s10, s10, s41 +; SI-NEXT: v_readlane_b32 s27, v62, 9 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_and_b32 s5, s38, 0xff -; SI-NEXT: s_lshl_b32 s8, s75, 8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s38, 0xff +; SI-NEXT: s_lshl_b32 s28, s28, 8 +; SI-NEXT: v_readlane_b32 s27, v62, 11 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s91, 0xff +; SI-NEXT: s_or_b32 s10, s10, s28 +; SI-NEXT: s_and_b32 s28, s19, 0xff +; SI-NEXT: v_readlane_b32 s27, v62, 13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 24 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s52, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v62, 25 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v62, 26 -; SI-NEXT: v_readlane_b32 s9, v62, 27 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s60, v62, 28 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s60, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v30 +; SI-NEXT: v_readlane_b32 s27, v62, 15 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s28, v1 +; SI-NEXT: v_readlane_b32 s26, v62, 24 +; SI-NEXT: v_readlane_b32 s27, v62, 17 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_and_b32 s10, s52, 0xff +; SI-NEXT: s_lshl_b32 s28, s26, 8 +; SI-NEXT: v_readlane_b32 s26, v62, 26 +; SI-NEXT: v_readlane_b32 s27, v62, 19 +; SI-NEXT: s_or_b32 s10, s10, s28 +; SI-NEXT: s_and_b32 s28, s26, 0xff +; SI-NEXT: v_readlane_b32 vcc_lo, v62, 28 +; SI-NEXT: v_readlane_b32 s27, v62, 21 +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_lshl_b32 s41, vcc_lo, 24 +; SI-NEXT: v_readlane_b32 s27, v62, 25 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s28, s41, s28 +; SI-NEXT: v_readlane_b32 s27, v62, 27 +; SI-NEXT: s_or_b32 s10, s10, s28 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_and_b32 s5, s98, 0xff -; SI-NEXT: s_lshl_b32 s8, s58, 8 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 29 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s34, 0xff +; SI-NEXT: s_lshl_b32 s27, s61, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s37, 0xff +; SI-NEXT: s_or_b32 s10, s10, s27 +; SI-NEXT: s_and_b32 s27, s13, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v11 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 30 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s30, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v62, 31 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v62, 32 -; SI-NEXT: v_readlane_b32 s9, v62, 33 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s60, v62, 34 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s60, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v37 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s27, v1 +; SI-NEXT: v_readlane_b32 s26, v62, 30 +; SI-NEXT: v_readlane_b32 s27, v62, 31 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s36, 0xff +; SI-NEXT: s_lshl_b32 s27, s26, 8 +; SI-NEXT: s_or_b32 s10, s10, s27 +; SI-NEXT: v_readlane_b32 s26, v62, 32 +; SI-NEXT: v_readlane_b32 s27, v62, 33 +; SI-NEXT: s_and_b32 s27, s26, 0xff +; SI-NEXT: v_readlane_b32 s42, v62, 34 +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: s_lshl_b32 s28, s42, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s27, s28, s27 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s10, s10, s27 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_and_b32 s5, s82, 0xff -; SI-NEXT: s_lshl_b32 s8, s43, 8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s90, 0xff +; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s89, 0xff +; SI-NEXT: s_or_b32 s10, s10, s24 +; SI-NEXT: s_and_b32 s24, s65, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v47 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 36 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s50, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v62, 37 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v62, 38 -; SI-NEXT: v_readlane_b32 s9, v62, 39 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s42, v62, 40 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s42, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s24, v1 +; SI-NEXT: v_readlane_b32 s26, v62, 36 +; SI-NEXT: v_readlane_b32 s27, v62, 37 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s94, 0xff +; SI-NEXT: s_lshl_b32 s24, s26, 8 +; SI-NEXT: v_readlane_b32 s26, v62, 38 +; SI-NEXT: v_readlane_b32 s27, v62, 39 +; SI-NEXT: s_or_b32 s10, s10, s24 +; SI-NEXT: s_and_b32 s24, s26, 0xff +; SI-NEXT: v_readlane_b32 s26, v62, 40 +; SI-NEXT: v_readlane_b32 s27, v62, 41 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_lshl_b32 s27, s26, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s24, s27, s24 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s10, s10, s24 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_and_b32 s5, s94, 0xff -; SI-NEXT: s_lshl_b32 s8, s44, 8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s76, 0xff +; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s57, 0xff +; SI-NEXT: s_or_b32 s10, s10, s23 +; SI-NEXT: s_and_b32 s23, s71, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 42 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s92, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v62, 43 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v62, 44 -; SI-NEXT: v_readlane_b32 s9, v62, 45 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s42, v62, 46 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s42, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v36 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s23, v1 +; SI-NEXT: v_readlane_b32 s26, v62, 42 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s88, 0xff +; SI-NEXT: s_lshl_b32 s23, s26, 8 +; SI-NEXT: v_readlane_b32 s26, v62, 44 +; SI-NEXT: s_or_b32 s10, s10, s23 +; SI-NEXT: s_and_b32 s23, s26, 0xff +; SI-NEXT: v_readlane_b32 s26, v62, 46 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_lshl_b32 s24, s26, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s23, s24, s23 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s10, s10, s23 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_and_b32 s5, s64, 0xff -; SI-NEXT: s_lshl_b32 s8, s27, 8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s92, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s79, 0xff +; SI-NEXT: s_or_b32 s10, s10, s21 +; SI-NEXT: s_and_b32 s21, s85, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v18 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 48 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s76, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v62, 49 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v62, 50 -; SI-NEXT: v_readlane_b32 s9, v62, 51 -; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s21, v1 +; SI-NEXT: v_readlane_b32 s20, v62, 48 +; SI-NEXT: v_readlane_b32 s21, v62, 49 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s74, 0xff +; SI-NEXT: s_lshl_b32 s21, s20, 8 +; SI-NEXT: s_or_b32 s10, s10, s21 +; SI-NEXT: v_readlane_b32 s20, v62, 50 +; SI-NEXT: v_readlane_b32 s21, v62, 51 +; SI-NEXT: s_and_b32 s21, s20, 0xff ; SI-NEXT: v_readlane_b32 s26, v62, 52 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s26, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_lshl_b32 s23, s26, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s21, s23, s21 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s10, s10, s21 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_and_b32 s5, s90, 0xff -; SI-NEXT: s_lshl_b32 s8, s28, 8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s62, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s73, 0xff +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: s_and_b32 s18, s73, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v55 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 54 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s62, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v62, 55 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v62, 56 -; SI-NEXT: v_readlane_b32 s9, v62, 57 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s26, v62, 58 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s26, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s18, v1 +; SI-NEXT: v_readlane_b32 s18, v62, 54 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s60, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: s_or_b32 s10, s10, s18 +; SI-NEXT: v_readlane_b32 s18, v62, 56 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: v_readlane_b32 s20, v62, 58 +; SI-NEXT: v_readlane_b32 s21, v62, 59 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s21, s20, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s18, s21, s18 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s10, s10, s18 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_and_b32 s5, s54, 0xff -; SI-NEXT: s_lshl_b32 s8, s74, 8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s78, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s59, 0xff +; SI-NEXT: s_or_b32 s10, s10, s17 +; SI-NEXT: s_and_b32 s17, s59, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 60 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s46, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v62, 61 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v62, 62 -; SI-NEXT: v_readlane_b32 s9, v62, 63 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s26, v61, 0 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s26, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s17, v1 +; SI-NEXT: v_readlane_b32 s18, v62, 60 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s46, 0xff +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v62, 62 +; SI-NEXT: s_or_b32 s10, s10, s17 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v61, 0 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s10, s10, s17 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_and_b32 s5, s68, 0xff -; SI-NEXT: s_lshl_b32 s8, s22, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s45, 0xff +; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v4 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v61, 2 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s40, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v61, 3 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v61, 4 -; SI-NEXT: v_readlane_b32 s9, v61, 5 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s26, v61, 6 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s26, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_and_b32 s10, s64, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 8 +; SI-NEXT: s_or_b32 s10, s10, s15 +; SI-NEXT: s_and_b32 s15, s45, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_readlane_b32 s18, v61, 2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_and_b32 s5, s56, 0xff -; SI-NEXT: s_lshl_b32 s8, s21, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s29, 0xff -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v61, 8 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s24, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v61, 9 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v61, 10 -; SI-NEXT: v_readlane_b32 s9, v61, 11 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s20, v61, 12 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s20, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_and_b32 s5, s34, 0xff -; SI-NEXT: s_lshl_b32 s8, s18, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s23, 0xff -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v56 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v61, 14 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s16, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v61, 15 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v61, 16 -; SI-NEXT: v_readlane_b32 s9, v61, 17 +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_lshl_b32 s11, s11, 8 +; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s16, v61, 18 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s16, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 -; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s5, s36, 0xff -; SI-NEXT: s_lshl_b32 s8, s15, 8 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s8, s19, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_readlane_b32 s9, v61, 21 -; SI-NEXT: v_readlane_b32 s9, v61, 23 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s61, v62, 17 +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 8 +; SI-NEXT: v_readlane_b32 s19, v62, 55 +; SI-NEXT: v_readlane_b32 s19, v62, 57 +; SI-NEXT: v_readlane_b32 s19, v62, 61 +; SI-NEXT: v_readlane_b32 s19, v62, 63 +; SI-NEXT: v_readlane_b32 s19, v61, 1 +; SI-NEXT: v_readlane_b32 s13, v61, 9 +; SI-NEXT: v_readlane_b32 s27, v62, 43 +; SI-NEXT: v_readlane_b32 s19, v61, 3 +; SI-NEXT: v_readlane_b32 s13, v61, 11 +; SI-NEXT: v_readlane_b32 s27, v62, 45 +; SI-NEXT: v_readlane_b32 s19, v61, 5 +; SI-NEXT: v_readlane_b32 s13, v61, 15 +; SI-NEXT: v_readlane_b32 s27, v62, 47 +; SI-NEXT: v_readlane_b32 s19, v61, 7 +; SI-NEXT: v_readlane_b32 s13, v61, 17 +; SI-NEXT: v_readlane_b32 s43, v62, 35 ; SI-NEXT: v_readlane_b32 s27, v62, 53 -; SI-NEXT: v_readlane_b32 s61, v62, 23 -; SI-NEXT: v_readlane_b32 s27, v62, 59 -; SI-NEXT: v_readlane_b32 s61, v62, 29 -; SI-NEXT: v_readlane_b32 s43, v62, 41 -; SI-NEXT: v_readlane_b32 s27, v61, 1 -; SI-NEXT: v_readlane_b32 s61, v62, 35 -; SI-NEXT: v_readlane_b32 s43, v62, 47 -; SI-NEXT: v_readlane_b32 s27, v61, 7 -; SI-NEXT: v_readlane_b32 s21, v61, 13 -; SI-NEXT: v_readlane_b32 s17, v61, 19 -; SI-NEXT: v_readlane_b32 s11, v61, 25 +; SI-NEXT: v_readlane_b32 s19, v61, 13 +; SI-NEXT: v_readlane_b32 s13, v61, 19 ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -169259,9 +168535,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s53, v63, 13 ; SI-NEXT: v_readlane_b32 s52, v63, 12 ; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 ; SI-NEXT: v_readlane_b32 s39, v63, 7 ; SI-NEXT: v_readlane_b32 s38, v63, 6 ; SI-NEXT: v_readlane_b32 s37, v63, 5 @@ -169269,74 +168542,144 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s35, v63, 3 ; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: v_or_b32_e32 v1, s8, v1 -; SI-NEXT: v_readlane_b32 s8, v61, 20 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_and_b32 s5, s10, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v61, 22 -; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_or_b32_e32 v1, s15, v1 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s40, 0xff +; SI-NEXT: s_lshl_b32 s15, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v61, 4 +; SI-NEXT: s_or_b32 s10, s10, s15 +; SI-NEXT: s_and_b32 s15, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v61, 6 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s17, s18, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s15, s17, s15 +; SI-NEXT: s_or_b32 s10, s10, s15 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s50, 0xff +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s12, s29, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s12, v1 +; SI-NEXT: v_readlane_b32 s12, v61, 8 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s22, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: v_readlane_b32 s12, v61, 10 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: v_readlane_b32 s18, v61, 12 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s15, s18, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s12, s15, s12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s30, 0xff +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s25, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v7 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s11, v1 +; SI-NEXT: v_readlane_b32 s12, v61, 14 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s16, 0xff +; SI-NEXT: s_lshl_b32 s11, s12, 8 +; SI-NEXT: v_readlane_b32 s12, v61, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s12, 0xff +; SI-NEXT: v_readlane_b32 s12, v61, 18 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s48, 0xff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_readlane_b32 s10, v61, 20 +; SI-NEXT: v_readlane_b32 s11, v61, 21 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v6 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: v_readlane_b32 s10, v61, 22 +; SI-NEXT: v_or_b32_e32 v1, s9, v1 +; SI-NEXT: s_lshl_b32 s9, s10, 8 ; SI-NEXT: v_readlane_b32 s10, v61, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s10, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s10, 0xff +; SI-NEXT: v_readlane_b32 s10, v61, 26 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s10, s10, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s14, 0xff ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 -; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s49, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s5, s14, 0xff -; SI-NEXT: s_lshl_b32 s8, s12, 8 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: v_readlane_b32 s8, v61, 26 -; SI-NEXT: v_readlane_b32 s9, v61, 27 -; SI-NEXT: s_and_b32 s8, s9, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: v_readlane_b32 s9, v61, 29 -; SI-NEXT: v_readlane_b32 s9, v61, 31 -; SI-NEXT: v_readlane_b32 s9, v61, 33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v61, 28 -; SI-NEXT: v_or_b32_e32 v1, s5, v1 -; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_or_b32_e32 v1, s7, v1 +; SI-NEXT: s_lshl_b32 s7, s8, 8 ; SI-NEXT: v_readlane_b32 s8, v61, 30 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s8, 0xff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s8, 0xff ; SI-NEXT: v_readlane_b32 s8, v61, 32 -; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s8, 24 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_readlane_b32 s7, v61, 35 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 -; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s72, 0xff +; SI-NEXT: s_and_b32 s5, s7, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s4, s88, 0xff -; SI-NEXT: s_lshl_b32 s5, s6, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s13, 0xff +; SI-NEXT: s_or_b32 s4, s6, s4 ; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v8 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -169351,10 +168694,21 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s11, v61, 23 +; SI-NEXT: v_readlane_b32 s9, v61, 29 +; SI-NEXT: v_readlane_b32 s11, v61, 25 +; SI-NEXT: v_readlane_b32 s9, v61, 31 +; SI-NEXT: v_readlane_b32 s11, v61, 27 +; SI-NEXT: v_readlane_b32 s9, v61, 33 +; SI-NEXT: v_readlane_b32 s6, v61, 34 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -169451,184 +168805,184 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane ; VI-NEXT: s_cbranch_scc0 .LBB91_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s19, 24 -; VI-NEXT: v_writelane_b32 v22, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s19, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s19, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s18, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s18, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s21, 24 -; VI-NEXT: v_writelane_b32 v22, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s21, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s20, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s20, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s23, 24 -; VI-NEXT: v_writelane_b32 v22, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s23, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s23, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s22, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s22, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s25, 24 -; VI-NEXT: v_writelane_b32 v22, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s25, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s24, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s24, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s27, 24 -; VI-NEXT: v_writelane_b32 v22, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s27, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s26, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s26, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s29, 24 -; VI-NEXT: v_writelane_b32 v22, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s29, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s29, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s28, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s28, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s5, 24 -; VI-NEXT: v_writelane_b32 v22, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s5, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s5, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s4, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s4, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s7, 24 -; VI-NEXT: v_writelane_b32 v22, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s7, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s6, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v22, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s9, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s8, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: v_writelane_b32 v22, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s11, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 57 -; VI-NEXT: s_lshr_b32 s46, s10, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 58 -; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 59 -; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: v_writelane_b32 v22, s46, 60 -; VI-NEXT: s_lshr_b32 s46, s13, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 61 -; VI-NEXT: s_lshr_b32 s46, s13, 8 -; VI-NEXT: v_writelane_b32 v22, s46, 62 -; VI-NEXT: s_lshr_b32 s46, s12, 16 -; VI-NEXT: v_writelane_b32 v22, s46, 63 -; VI-NEXT: s_lshr_b32 s46, s12, 8 -; VI-NEXT: s_lshr_b64 vcc, s[4:5], 24 -; VI-NEXT: v_writelane_b32 v21, s46, 0 -; VI-NEXT: s_lshr_b32 s46, s15, 24 -; VI-NEXT: v_writelane_b32 v22, vcc_lo, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 1 -; VI-NEXT: s_lshr_b32 s46, s15, 16 -; VI-NEXT: v_writelane_b32 v22, vcc_hi, 9 -; VI-NEXT: s_lshr_b64 vcc, s[6:7], 24 -; VI-NEXT: v_writelane_b32 v21, s46, 2 -; VI-NEXT: s_lshr_b32 s46, s15, 8 -; VI-NEXT: v_writelane_b32 v22, vcc_lo, 6 -; VI-NEXT: v_writelane_b32 v21, s46, 3 -; VI-NEXT: s_lshr_b32 s46, s14, 16 -; VI-NEXT: v_writelane_b32 v22, vcc_hi, 7 -; VI-NEXT: s_lshr_b64 vcc, s[8:9], 24 -; VI-NEXT: v_writelane_b32 v21, s46, 4 -; VI-NEXT: s_lshr_b32 s46, s14, 8 +; VI-NEXT: s_lshr_b32 s47, s21, 24 +; VI-NEXT: v_writelane_b32 v22, s47, 22 +; VI-NEXT: s_lshr_b32 s47, s21, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 23 +; VI-NEXT: s_lshr_b32 s47, s21, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 24 +; VI-NEXT: s_lshr_b32 s47, s20, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 25 +; VI-NEXT: s_lshr_b32 s47, s20, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 26 +; VI-NEXT: s_lshr_b32 s47, s23, 24 +; VI-NEXT: v_writelane_b32 v22, s47, 27 +; VI-NEXT: s_lshr_b32 s47, s23, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 28 +; VI-NEXT: s_lshr_b32 s47, s23, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 29 +; VI-NEXT: s_lshr_b32 s47, s22, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 30 +; VI-NEXT: s_lshr_b32 s47, s22, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 31 +; VI-NEXT: s_lshr_b32 s47, s25, 24 +; VI-NEXT: v_writelane_b32 v22, s47, 32 +; VI-NEXT: s_lshr_b32 s47, s25, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 33 +; VI-NEXT: s_lshr_b32 s47, s25, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 34 +; VI-NEXT: s_lshr_b32 s47, s24, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 35 +; VI-NEXT: s_lshr_b32 s47, s24, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 36 +; VI-NEXT: s_lshr_b32 s47, s27, 24 +; VI-NEXT: v_writelane_b32 v22, s47, 37 +; VI-NEXT: s_lshr_b32 s47, s27, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 38 +; VI-NEXT: s_lshr_b32 s47, s27, 8 +; VI-NEXT: v_writelane_b32 v22, s47, 39 +; VI-NEXT: s_lshr_b32 s47, s26, 16 +; VI-NEXT: v_writelane_b32 v22, s47, 40 +; VI-NEXT: s_lshr_b32 s59, s5, 24 +; VI-NEXT: v_writelane_b32 v22, s59, 41 +; VI-NEXT: s_lshr_b32 s59, s5, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 42 +; VI-NEXT: s_lshr_b32 s59, s5, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 43 +; VI-NEXT: s_lshr_b32 s59, s4, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 44 +; VI-NEXT: s_lshr_b32 s59, s4, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 45 +; VI-NEXT: s_lshr_b32 s59, s7, 24 +; VI-NEXT: v_writelane_b32 v22, s59, 46 +; VI-NEXT: s_lshr_b32 s59, s7, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 47 +; VI-NEXT: s_lshr_b32 s59, s7, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 48 +; VI-NEXT: s_lshr_b32 s59, s6, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 49 +; VI-NEXT: s_lshr_b32 s59, s6, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 50 +; VI-NEXT: s_lshr_b32 s59, s9, 24 +; VI-NEXT: v_writelane_b32 v22, s59, 51 +; VI-NEXT: s_lshr_b32 s59, s9, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 52 +; VI-NEXT: s_lshr_b32 s59, s9, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 53 +; VI-NEXT: s_lshr_b32 s59, s8, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 54 +; VI-NEXT: s_lshr_b32 s59, s8, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 55 +; VI-NEXT: s_lshr_b32 s59, s11, 24 +; VI-NEXT: v_writelane_b32 v22, s59, 56 +; VI-NEXT: s_lshr_b32 s59, s11, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 57 +; VI-NEXT: s_lshr_b32 s59, s11, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 58 +; VI-NEXT: s_lshr_b32 s59, s10, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 59 +; VI-NEXT: s_lshr_b32 s59, s10, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 60 +; VI-NEXT: s_lshr_b32 s59, s13, 24 +; VI-NEXT: v_writelane_b32 v22, s59, 61 +; VI-NEXT: s_lshr_b32 s59, s13, 16 +; VI-NEXT: v_writelane_b32 v22, s59, 62 +; VI-NEXT: s_lshr_b32 s59, s13, 8 +; VI-NEXT: v_writelane_b32 v22, s59, 63 +; VI-NEXT: s_lshr_b64 vcc, s[18:19], 24 ; VI-NEXT: v_writelane_b32 v22, vcc_lo, 4 -; VI-NEXT: v_writelane_b32 v21, s46, 5 -; VI-NEXT: s_lshr_b32 s46, s17, 24 ; VI-NEXT: v_writelane_b32 v22, vcc_hi, 5 -; VI-NEXT: s_lshr_b64 vcc, s[10:11], 24 -; VI-NEXT: v_writelane_b32 v21, s46, 6 -; VI-NEXT: s_lshr_b32 s46, s17, 16 +; VI-NEXT: s_lshr_b64 vcc, s[20:21], 24 ; VI-NEXT: v_writelane_b32 v22, vcc_lo, 2 -; VI-NEXT: v_writelane_b32 v21, s46, 7 -; VI-NEXT: s_lshr_b32 s46, s17, 8 ; VI-NEXT: v_writelane_b32 v22, vcc_hi, 3 -; VI-NEXT: s_lshr_b64 vcc, s[12:13], 24 -; VI-NEXT: v_writelane_b32 v21, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s16, 16 +; VI-NEXT: s_lshr_b64 vcc, s[22:23], 24 ; VI-NEXT: v_writelane_b32 v22, vcc_lo, 0 -; VI-NEXT: s_lshr_b32 s47, s43, 24 -; VI-NEXT: s_lshr_b32 s57, s43, 16 -; VI-NEXT: s_lshr_b32 s61, s43, 8 -; VI-NEXT: s_lshr_b32 s75, s42, 16 -; VI-NEXT: s_lshr_b32 s79, s42, 8 -; VI-NEXT: s_lshr_b32 s89, s45, 24 -; VI-NEXT: s_lshr_b32 s91, s45, 16 -; VI-NEXT: s_lshr_b32 s31, s45, 8 -; VI-NEXT: s_lshr_b32 s37, s44, 16 -; VI-NEXT: s_lshr_b32 s49, s44, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 9 -; VI-NEXT: s_lshr_b32 s59, s16, 8 -; VI-NEXT: s_lshr_b32 s63, s41, 24 -; VI-NEXT: s_lshr_b32 s73, s41, 16 -; VI-NEXT: s_lshr_b32 s77, s41, 8 -; VI-NEXT: s_lshr_b32 s53, s40, 16 -; VI-NEXT: s_lshr_b32 s65, s40, 8 -; VI-NEXT: s_lshr_b64 s[80:81], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[82:83], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[86:87], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[50:51], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[54:55], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[66:67], s[44:45], 24 ; VI-NEXT: v_writelane_b32 v22, vcc_hi, 1 -; VI-NEXT: s_lshr_b64 s[68:69], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[70:71], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[84:85], s[40:41], 24 +; VI-NEXT: s_lshr_b64 vcc, s[4:5], 24 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 20 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 21 +; VI-NEXT: s_lshr_b64 vcc, s[6:7], 24 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 18 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 19 +; VI-NEXT: s_lshr_b64 vcc, s[8:9], 24 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 16 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 17 +; VI-NEXT: s_lshr_b64 vcc, s[10:11], 24 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 14 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 15 +; VI-NEXT: s_lshr_b64 vcc, s[12:13], 24 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 12 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 13 +; VI-NEXT: s_lshr_b64 vcc, s[14:15], 24 +; VI-NEXT: s_lshr_b32 s59, s12, 16 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 10 +; VI-NEXT: v_writelane_b32 v21, s59, 0 +; VI-NEXT: s_lshr_b32 s59, s12, 8 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 11 +; VI-NEXT: s_lshr_b64 vcc, s[16:17], 24 +; VI-NEXT: v_writelane_b32 v21, s59, 1 +; VI-NEXT: s_lshr_b32 s59, s15, 24 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 8 +; VI-NEXT: v_writelane_b32 v21, s59, 2 +; VI-NEXT: s_lshr_b32 s59, s15, 16 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 9 +; VI-NEXT: s_lshr_b64 vcc, s[40:41], 24 +; VI-NEXT: s_lshr_b32 s46, s19, 24 +; VI-NEXT: s_lshr_b32 s56, s19, 16 +; VI-NEXT: s_lshr_b32 s58, s19, 8 +; VI-NEXT: s_lshr_b32 s60, s18, 16 +; VI-NEXT: s_lshr_b32 s62, s18, 8 +; VI-NEXT: v_writelane_b32 v21, s59, 3 +; VI-NEXT: s_lshr_b32 s59, s15, 8 +; VI-NEXT: s_lshr_b64 s[86:87], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 6 +; VI-NEXT: s_lshr_b32 s47, s26, 8 +; VI-NEXT: s_lshr_b32 s67, s29, 24 +; VI-NEXT: s_lshr_b32 s57, s29, 16 +; VI-NEXT: s_lshr_b32 s69, s29, 8 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 8 +; VI-NEXT: s_lshr_b32 s53, s43, 24 +; VI-NEXT: s_lshr_b32 s71, s43, 16 +; VI-NEXT: s_lshr_b32 s77, s43, 8 +; VI-NEXT: s_lshr_b32 s91, s42, 16 +; VI-NEXT: s_lshr_b32 s83, s42, 8 +; VI-NEXT: s_lshr_b32 s54, s45, 24 +; VI-NEXT: s_lshr_b32 s51, s45, 16 +; VI-NEXT: s_lshr_b32 s65, s45, 8 +; VI-NEXT: s_lshr_b32 s81, s44, 16 +; VI-NEXT: s_lshr_b32 s85, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s59, 4 +; VI-NEXT: s_lshr_b32 s66, s14, 16 +; VI-NEXT: s_lshr_b32 s68, s14, 8 +; VI-NEXT: s_lshr_b32 s59, s17, 24 +; VI-NEXT: s_lshr_b32 s61, s17, 16 +; VI-NEXT: s_lshr_b32 s73, s17, 8 +; VI-NEXT: s_lshr_b32 s52, s16, 16 +; VI-NEXT: s_lshr_b32 s70, s16, 8 +; VI-NEXT: s_lshr_b32 s79, s41, 24 +; VI-NEXT: s_lshr_b32 s89, s41, 16 +; VI-NEXT: s_lshr_b32 s82, s41, 8 +; VI-NEXT: s_lshr_b32 s35, s40, 16 +; VI-NEXT: s_lshr_b32 s55, s40, 8 +; VI-NEXT: s_mov_b32 s87, s62 +; VI-NEXT: s_mov_b32 s31, s46 +; VI-NEXT: s_mov_b32 s37, s56 +; VI-NEXT: s_mov_b32 s39, s58 +; VI-NEXT: s_mov_b32 s49, s60 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 7 ; VI-NEXT: s_mov_b32 s78, s45 ; VI-NEXT: s_mov_b32 s88, s43 -; VI-NEXT: s_mov_b32 s90, s29 -; VI-NEXT: s_mov_b32 s30, s27 -; VI-NEXT: s_mov_b32 s36, s25 -; VI-NEXT: s_mov_b32 s48, s23 -; VI-NEXT: s_mov_b32 s52, s21 -; VI-NEXT: s_mov_b32 s64, s19 +; VI-NEXT: s_mov_b32 s34, s29 +; VI-NEXT: s_mov_b32 s50, s27 +; VI-NEXT: s_mov_b32 s64, s25 +; VI-NEXT: s_mov_b32 s80, s23 +; VI-NEXT: s_mov_b32 s84, s21 +; VI-NEXT: s_mov_b32 s90, s19 ; VI-NEXT: s_mov_b32 s46, s41 ; VI-NEXT: s_mov_b32 s56, s17 ; VI-NEXT: s_mov_b32 s58, s15 @@ -170106,12 +169460,12 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readfirstlane_b32 s29, v2 ; VI-NEXT: s_bfe_u32 s47, s29, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s29 -; VI-NEXT: s_lshr_b64 s[90:91], s[90:91], 16 +; VI-NEXT: s_lshr_b64 s[34:35], s[90:91], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_bitset1_b32 s29, 22 -; VI-NEXT: s_and_b64 vcc, vcc, exec -; VI-NEXT: s_cselect_b32 s30, s29, s47 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s29, s47 ; VI-NEXT: s_and_b32 s28, s28, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v2, s28, v1 ; VI-NEXT: v_readfirstlane_b32 s28, v2 @@ -170126,14 +169480,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v2, s47, v1 ; VI-NEXT: v_readfirstlane_b32 s47, v2 ; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 -; VI-NEXT: s_lshr_b32 s31, s28, 16 +; VI-NEXT: s_lshr_b32 s91, s28, 16 ; VI-NEXT: s_add_i32 s57, s57, s47 -; VI-NEXT: s_lshr_b64 s[28:29], s[30:31], 16 +; VI-NEXT: s_lshr_b64 s[28:29], s[90:91], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_bitset1_b32 s47, 22 -; VI-NEXT: s_and_b64 vcc, vcc, exec -; VI-NEXT: s_cselect_b32 s30, s47, s57 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s47, s57 ; VI-NEXT: s_and_b32 s27, s27, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v2, s27, v1 ; VI-NEXT: v_readfirstlane_b32 s27, v2 @@ -170144,18 +169498,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bitset1_b32 s27, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s27, s27, s47 -; VI-NEXT: s_lshr_b32 s31, s27, 16 +; VI-NEXT: s_lshr_b32 s91, s27, 16 ; VI-NEXT: s_lshl_b32 s27, s26, 16 ; VI-NEXT: v_add_f32_e32 v2, s27, v1 ; VI-NEXT: v_readfirstlane_b32 s27, v2 ; VI-NEXT: s_bfe_u32 s47, s27, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s27 -; VI-NEXT: s_lshr_b64 s[30:31], s[30:31], 16 +; VI-NEXT: s_lshr_b64 s[50:51], s[90:91], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_bitset1_b32 s27, 22 -; VI-NEXT: s_and_b64 vcc, vcc, exec -; VI-NEXT: s_cselect_b32 s34, s27, s47 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s27, s47 ; VI-NEXT: s_and_b32 s26, s26, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v2, s26, v1 ; VI-NEXT: v_readfirstlane_b32 s26, v2 @@ -170170,14 +169524,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v2, s47, v1 ; VI-NEXT: v_readfirstlane_b32 s47, v2 ; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 -; VI-NEXT: s_lshr_b32 s35, s26, 16 +; VI-NEXT: s_lshr_b32 s91, s26, 16 ; VI-NEXT: s_add_i32 s57, s57, s47 -; VI-NEXT: s_lshr_b64 s[26:27], s[34:35], 16 +; VI-NEXT: s_lshr_b64 s[26:27], s[90:91], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_bitset1_b32 s47, 22 -; VI-NEXT: s_and_b64 vcc, vcc, exec -; VI-NEXT: s_cselect_b32 s34, s47, s57 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s47, s57 ; VI-NEXT: s_and_b32 s25, s25, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v2, s25, v1 ; VI-NEXT: v_readfirstlane_b32 s25, v2 @@ -170188,18 +169542,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bitset1_b32 s25, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s25, s25, s47 -; VI-NEXT: s_lshr_b32 s35, s25, 16 +; VI-NEXT: s_lshr_b32 s91, s25, 16 ; VI-NEXT: s_lshl_b32 s25, s24, 16 ; VI-NEXT: v_add_f32_e32 v2, s25, v1 ; VI-NEXT: v_readfirstlane_b32 s25, v2 ; VI-NEXT: s_bfe_u32 s47, s25, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s25 -; VI-NEXT: s_lshr_b64 s[36:37], s[34:35], 16 +; VI-NEXT: s_lshr_b64 s[64:65], s[90:91], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_bitset1_b32 s25, 22 -; VI-NEXT: s_and_b64 vcc, vcc, exec -; VI-NEXT: s_cselect_b32 s34, s25, s47 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s25, s47 ; VI-NEXT: s_and_b32 s24, s24, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v2, s24, v1 ; VI-NEXT: v_readfirstlane_b32 s24, v2 @@ -170214,14 +169568,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v2, s47, v1 ; VI-NEXT: v_readfirstlane_b32 s47, v2 ; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 -; VI-NEXT: s_lshr_b32 s35, s24, 16 +; VI-NEXT: s_lshr_b32 s91, s24, 16 ; VI-NEXT: s_add_i32 s57, s57, s47 -; VI-NEXT: s_lshr_b64 s[24:25], s[34:35], 16 +; VI-NEXT: s_lshr_b64 s[24:25], s[90:91], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_bitset1_b32 s47, 22 -; VI-NEXT: s_and_b64 vcc, vcc, exec -; VI-NEXT: s_cselect_b32 s34, s47, s57 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s47, s57 ; VI-NEXT: s_and_b32 s23, s23, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v2, s23, v1 ; VI-NEXT: v_readfirstlane_b32 s23, v2 @@ -170232,18 +169586,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bitset1_b32 s23, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s23, s23, s47 -; VI-NEXT: s_lshr_b32 s35, s23, 16 +; VI-NEXT: s_lshr_b32 s91, s23, 16 ; VI-NEXT: s_lshl_b32 s23, s22, 16 ; VI-NEXT: v_add_f32_e32 v2, s23, v1 ; VI-NEXT: v_readfirstlane_b32 s23, v2 ; VI-NEXT: s_bfe_u32 s47, s23, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s23 -; VI-NEXT: s_lshr_b64 s[48:49], s[34:35], 16 +; VI-NEXT: s_lshr_b64 s[80:81], s[90:91], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_bitset1_b32 s23, 22 -; VI-NEXT: s_and_b64 vcc, vcc, exec -; VI-NEXT: s_cselect_b32 s34, s23, s47 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s23, s47 ; VI-NEXT: s_and_b32 s22, s22, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v2, s22, v1 ; VI-NEXT: v_readfirstlane_b32 s22, v2 @@ -170258,14 +169612,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v2, s47, v1 ; VI-NEXT: v_readfirstlane_b32 s47, v2 ; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 -; VI-NEXT: s_lshr_b32 s35, s22, 16 +; VI-NEXT: s_lshr_b32 s91, s22, 16 ; VI-NEXT: s_add_i32 s57, s57, s47 -; VI-NEXT: s_lshr_b64 s[22:23], s[34:35], 16 +; VI-NEXT: s_lshr_b64 s[22:23], s[90:91], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_bitset1_b32 s47, 22 -; VI-NEXT: s_and_b64 vcc, vcc, exec -; VI-NEXT: s_cselect_b32 s34, s47, s57 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s47, s57 ; VI-NEXT: s_and_b32 s21, s21, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v2, s21, v1 ; VI-NEXT: v_readfirstlane_b32 s21, v2 @@ -170276,18 +169630,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bitset1_b32 s21, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s21, s21, s47 -; VI-NEXT: s_lshr_b32 s35, s21, 16 +; VI-NEXT: s_lshr_b32 s91, s21, 16 ; VI-NEXT: s_lshl_b32 s21, s20, 16 ; VI-NEXT: v_add_f32_e32 v2, s21, v1 ; VI-NEXT: v_readfirstlane_b32 s21, v2 ; VI-NEXT: s_bfe_u32 s47, s21, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s21 -; VI-NEXT: s_lshr_b64 s[52:53], s[34:35], 16 +; VI-NEXT: s_lshr_b64 s[84:85], s[90:91], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_bitset1_b32 s21, 22 -; VI-NEXT: s_and_b64 vcc, vcc, exec -; VI-NEXT: s_cselect_b32 s34, s21, s47 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s21, s47 ; VI-NEXT: s_and_b32 s20, s20, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v2, s20, v1 ; VI-NEXT: v_readfirstlane_b32 s20, v2 @@ -170302,14 +169656,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v2, s47, v1 ; VI-NEXT: v_readfirstlane_b32 s47, v2 ; VI-NEXT: s_bfe_u32 s57, s47, 0x10010 -; VI-NEXT: s_lshr_b32 s35, s20, 16 +; VI-NEXT: s_lshr_b32 s91, s20, 16 ; VI-NEXT: s_add_i32 s57, s57, s47 -; VI-NEXT: s_lshr_b64 s[20:21], s[34:35], 16 +; VI-NEXT: s_lshr_b64 s[20:21], s[90:91], 16 ; VI-NEXT: s_addk_i32 s57, 0x7fff ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_bitset1_b32 s47, 22 -; VI-NEXT: s_and_b64 vcc, vcc, exec -; VI-NEXT: s_cselect_b32 s34, s47, s57 +; VI-NEXT: s_and_b64 s[90:91], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s47, s57 ; VI-NEXT: s_and_b32 s19, s19, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v2, s19, v1 ; VI-NEXT: v_readfirstlane_b32 s19, v2 @@ -170320,18 +169674,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_bitset1_b32 s19, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec ; VI-NEXT: s_cselect_b32 s19, s19, s47 -; VI-NEXT: s_lshr_b32 s35, s19, 16 +; VI-NEXT: s_lshr_b32 s91, s19, 16 ; VI-NEXT: s_lshl_b32 s19, s18, 16 ; VI-NEXT: v_add_f32_e32 v2, s19, v1 ; VI-NEXT: v_readfirstlane_b32 s19, v2 ; VI-NEXT: s_bfe_u32 s47, s19, 0x10010 ; VI-NEXT: s_add_i32 s47, s47, s19 -; VI-NEXT: s_lshr_b64 s[64:65], s[34:35], 16 +; VI-NEXT: s_lshr_b64 s[90:91], s[90:91], 16 ; VI-NEXT: s_addk_i32 s47, 0x7fff ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_bitset1_b32 s19, 22 ; VI-NEXT: s_and_b64 vcc, vcc, exec -; VI-NEXT: s_cselect_b32 s34, s19, s47 +; VI-NEXT: s_cselect_b32 s30, s19, s47 ; VI-NEXT: s_and_b32 s18, s18, 0xffff0000 ; VI-NEXT: v_add_f32_e32 v1, s18, v1 ; VI-NEXT: v_readfirstlane_b32 s18, v1 @@ -170342,486 +169696,470 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b64 s[18:19], vcc, exec ; VI-NEXT: s_cselect_b32 s18, s57, s47 -; VI-NEXT: s_lshr_b32 s47, s64, 24 -; VI-NEXT: s_lshr_b32 s35, s18, 16 -; VI-NEXT: v_writelane_b32 v22, s47, 10 -; VI-NEXT: s_lshr_b32 s47, s64, 16 -; VI-NEXT: s_lshr_b64 s[18:19], s[34:35], 16 -; VI-NEXT: v_writelane_b32 v22, s47, 11 -; VI-NEXT: s_lshr_b32 s47, s64, 8 -; VI-NEXT: v_writelane_b32 v22, s47, 12 -; VI-NEXT: s_lshr_b32 s47, s18, 16 -; VI-NEXT: v_writelane_b32 v22, s47, 13 -; VI-NEXT: s_lshr_b32 s47, s18, 8 -; VI-NEXT: v_writelane_b32 v22, s47, 14 -; VI-NEXT: s_lshr_b32 s47, s52, 24 -; VI-NEXT: v_writelane_b32 v22, s47, 15 -; VI-NEXT: s_lshr_b32 s47, s52, 16 -; VI-NEXT: v_writelane_b32 v22, s47, 16 -; VI-NEXT: s_lshr_b32 s47, s52, 8 -; VI-NEXT: v_writelane_b32 v22, s47, 17 -; VI-NEXT: s_lshr_b32 s47, s20, 16 -; VI-NEXT: v_writelane_b32 v22, s47, 18 -; VI-NEXT: s_lshr_b32 s47, s20, 8 -; VI-NEXT: v_writelane_b32 v22, s47, 19 -; VI-NEXT: s_lshr_b32 s47, s48, 24 -; VI-NEXT: v_writelane_b32 v22, s47, 20 -; VI-NEXT: s_lshr_b32 s47, s48, 16 -; VI-NEXT: v_writelane_b32 v22, s47, 21 -; VI-NEXT: s_lshr_b32 s47, s48, 8 -; VI-NEXT: v_writelane_b32 v22, s47, 22 -; VI-NEXT: s_lshr_b32 s47, s22, 16 -; VI-NEXT: v_writelane_b32 v22, s47, 23 -; VI-NEXT: s_lshr_b32 s47, s22, 8 -; VI-NEXT: v_writelane_b32 v22, s47, 24 -; VI-NEXT: s_lshr_b32 s47, s36, 24 -; VI-NEXT: v_writelane_b32 v22, s47, 25 -; VI-NEXT: s_lshr_b32 s47, s36, 16 -; VI-NEXT: v_writelane_b32 v22, s47, 26 -; VI-NEXT: s_lshr_b32 s47, s36, 8 -; VI-NEXT: v_writelane_b32 v22, s47, 27 -; VI-NEXT: s_lshr_b32 s47, s24, 16 -; VI-NEXT: v_writelane_b32 v22, s47, 28 -; VI-NEXT: s_lshr_b32 s47, s24, 8 -; VI-NEXT: v_writelane_b32 v22, s47, 29 -; VI-NEXT: s_lshr_b32 s47, s30, 24 -; VI-NEXT: v_writelane_b32 v22, s47, 30 -; VI-NEXT: s_lshr_b32 s47, s30, 16 -; VI-NEXT: v_writelane_b32 v22, s47, 31 -; VI-NEXT: s_lshr_b32 s47, s30, 8 -; VI-NEXT: v_writelane_b32 v22, s47, 32 -; VI-NEXT: s_lshr_b32 s47, s26, 16 -; VI-NEXT: v_writelane_b32 v22, s47, 33 -; VI-NEXT: s_lshr_b32 s47, s26, 8 -; VI-NEXT: v_writelane_b32 v22, s47, 34 -; VI-NEXT: s_lshr_b32 s47, s90, 24 -; VI-NEXT: v_writelane_b32 v22, s47, 35 -; VI-NEXT: s_lshr_b32 s47, s90, 16 -; VI-NEXT: v_writelane_b32 v22, s47, 36 -; VI-NEXT: s_lshr_b32 s47, s90, 8 -; VI-NEXT: v_writelane_b32 v22, s47, 37 -; VI-NEXT: s_lshr_b32 s47, s28, 16 -; VI-NEXT: v_writelane_b32 v22, s47, 38 -; VI-NEXT: s_lshr_b32 s47, s28, 8 -; VI-NEXT: v_writelane_b32 v22, s47, 39 -; VI-NEXT: s_lshr_b32 s59, s76, 24 -; VI-NEXT: v_writelane_b32 v22, s59, 40 -; VI-NEXT: s_lshr_b32 s59, s76, 16 -; VI-NEXT: v_writelane_b32 v22, s59, 41 -; VI-NEXT: s_lshr_b32 s59, s76, 8 -; VI-NEXT: v_writelane_b32 v22, s59, 42 -; VI-NEXT: s_lshr_b32 s59, s4, 16 -; VI-NEXT: v_writelane_b32 v22, s59, 43 -; VI-NEXT: s_lshr_b32 s59, s4, 8 -; VI-NEXT: v_writelane_b32 v22, s59, 44 -; VI-NEXT: s_lshr_b32 s59, s74, 24 -; VI-NEXT: v_writelane_b32 v22, s59, 45 -; VI-NEXT: s_lshr_b32 s59, s74, 16 -; VI-NEXT: v_writelane_b32 v22, s59, 46 -; VI-NEXT: s_lshr_b32 s59, s74, 8 -; VI-NEXT: v_writelane_b32 v22, s59, 47 -; VI-NEXT: s_lshr_b32 s59, s6, 16 -; VI-NEXT: v_writelane_b32 v22, s59, 48 -; VI-NEXT: s_lshr_b32 s59, s6, 8 -; VI-NEXT: v_writelane_b32 v22, s59, 49 -; VI-NEXT: s_lshr_b32 s59, s72, 24 -; VI-NEXT: v_writelane_b32 v22, s59, 50 -; VI-NEXT: s_lshr_b32 s59, s72, 16 -; VI-NEXT: v_writelane_b32 v22, s59, 51 -; VI-NEXT: s_lshr_b32 s59, s72, 8 -; VI-NEXT: v_writelane_b32 v22, s59, 52 -; VI-NEXT: s_lshr_b32 s59, s8, 16 -; VI-NEXT: v_writelane_b32 v22, s59, 53 -; VI-NEXT: s_lshr_b32 s59, s8, 8 -; VI-NEXT: v_writelane_b32 v22, s59, 54 -; VI-NEXT: s_lshr_b32 s59, s62, 24 -; VI-NEXT: v_writelane_b32 v22, s59, 55 -; VI-NEXT: s_lshr_b32 s59, s62, 16 -; VI-NEXT: v_writelane_b32 v22, s59, 56 -; VI-NEXT: s_lshr_b32 s59, s62, 8 -; VI-NEXT: v_writelane_b32 v22, s59, 57 -; VI-NEXT: s_lshr_b32 s59, s10, 16 -; VI-NEXT: v_writelane_b32 v22, s59, 58 -; VI-NEXT: s_lshr_b32 s59, s10, 8 -; VI-NEXT: v_writelane_b32 v22, s59, 59 -; VI-NEXT: s_lshr_b32 s59, s60, 24 -; VI-NEXT: v_writelane_b32 v22, s59, 60 -; VI-NEXT: s_lshr_b32 s59, s60, 16 -; VI-NEXT: v_writelane_b32 v22, s59, 61 -; VI-NEXT: s_lshr_b32 s59, s60, 8 -; VI-NEXT: v_writelane_b32 v22, s59, 62 -; VI-NEXT: s_lshr_b32 s59, s12, 16 +; VI-NEXT: s_lshr_b32 s31, s18, 16 +; VI-NEXT: s_lshr_b64 s[18:19], s[30:31], 16 +; VI-NEXT: s_mov_b32 s19, s90 +; VI-NEXT: s_lshr_b64 vcc, s[18:19], 24 +; VI-NEXT: s_mov_b32 s21, s84 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 4 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 5 +; VI-NEXT: s_lshr_b64 vcc, s[20:21], 24 +; VI-NEXT: s_mov_b32 s23, s80 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 2 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 3 +; VI-NEXT: s_lshr_b64 vcc, s[22:23], 24 ; VI-NEXT: s_mov_b32 s5, s76 -; VI-NEXT: v_writelane_b32 v22, s59, 63 -; VI-NEXT: s_lshr_b32 s59, s12, 8 -; VI-NEXT: v_writelane_b32 v21, s59, 0 -; VI-NEXT: s_lshr_b32 s59, s58, 24 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 0 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 1 ; VI-NEXT: s_lshr_b64 vcc, s[4:5], 24 ; VI-NEXT: s_mov_b32 s7, s74 -; VI-NEXT: v_writelane_b32 v21, s59, 1 -; VI-NEXT: s_lshr_b32 s59, s58, 16 -; VI-NEXT: v_writelane_b32 v22, vcc_lo, 8 -; VI-NEXT: v_writelane_b32 v21, s59, 2 -; VI-NEXT: s_lshr_b32 s59, s58, 8 -; VI-NEXT: v_writelane_b32 v22, vcc_hi, 9 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 20 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 21 ; VI-NEXT: s_lshr_b64 vcc, s[6:7], 24 ; VI-NEXT: s_mov_b32 s9, s72 -; VI-NEXT: v_writelane_b32 v21, s59, 3 -; VI-NEXT: s_lshr_b32 s59, s14, 16 -; VI-NEXT: v_writelane_b32 v22, vcc_lo, 6 -; VI-NEXT: v_writelane_b32 v21, s59, 4 -; VI-NEXT: s_lshr_b32 s59, s14, 8 -; VI-NEXT: v_writelane_b32 v22, vcc_hi, 7 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 18 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 19 ; VI-NEXT: s_lshr_b64 vcc, s[8:9], 24 ; VI-NEXT: s_mov_b32 s11, s62 -; VI-NEXT: v_writelane_b32 v21, s59, 5 -; VI-NEXT: s_lshr_b32 s59, s56, 24 -; VI-NEXT: v_writelane_b32 v22, vcc_lo, 4 -; VI-NEXT: v_writelane_b32 v21, s59, 6 -; VI-NEXT: s_lshr_b32 s59, s56, 16 -; VI-NEXT: v_writelane_b32 v22, vcc_hi, 5 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 16 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 17 ; VI-NEXT: s_lshr_b64 vcc, s[10:11], 24 ; VI-NEXT: s_mov_b32 s13, s60 -; VI-NEXT: v_writelane_b32 v21, s59, 7 -; VI-NEXT: s_lshr_b32 s59, s56, 8 -; VI-NEXT: v_writelane_b32 v22, vcc_lo, 2 -; VI-NEXT: s_mov_b32 s41, s46 -; VI-NEXT: s_mov_b32 s17, s56 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 14 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 15 +; VI-NEXT: s_lshr_b64 vcc, s[12:13], 24 ; VI-NEXT: s_mov_b32 s15, s58 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 12 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 13 +; VI-NEXT: s_lshr_b64 vcc, s[14:15], 24 +; VI-NEXT: s_mov_b32 s17, s56 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 10 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 11 +; VI-NEXT: s_lshr_b64 vcc, s[16:17], 24 +; VI-NEXT: s_mov_b32 s41, s46 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 8 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 9 +; VI-NEXT: s_lshr_b64 vcc, s[40:41], 24 +; VI-NEXT: v_writelane_b32 v22, vcc_lo, 6 +; VI-NEXT: v_writelane_b32 v22, vcc_hi, 7 +; VI-NEXT: s_lshr_b32 s5, s84, 24 +; VI-NEXT: v_writelane_b32 v22, s5, 22 +; VI-NEXT: s_lshr_b32 s5, s84, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 23 +; VI-NEXT: s_lshr_b32 s5, s84, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 24 +; VI-NEXT: s_lshr_b32 s5, s20, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 25 +; VI-NEXT: s_lshr_b32 s5, s20, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 26 +; VI-NEXT: s_lshr_b32 s5, s80, 24 +; VI-NEXT: v_writelane_b32 v22, s5, 27 +; VI-NEXT: s_lshr_b32 s5, s80, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 28 +; VI-NEXT: s_lshr_b32 s5, s80, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 29 +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 30 +; VI-NEXT: s_lshr_b32 s5, s22, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 31 +; VI-NEXT: s_lshr_b32 s5, s64, 24 +; VI-NEXT: v_writelane_b32 v22, s5, 32 +; VI-NEXT: s_lshr_b32 s5, s64, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 33 +; VI-NEXT: s_lshr_b32 s5, s64, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 34 +; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 35 +; VI-NEXT: s_lshr_b32 s5, s24, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 36 +; VI-NEXT: s_lshr_b32 s5, s50, 24 +; VI-NEXT: v_writelane_b32 v22, s5, 37 +; VI-NEXT: s_lshr_b32 s5, s50, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 38 +; VI-NEXT: s_lshr_b32 s5, s50, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 39 +; VI-NEXT: s_lshr_b32 s5, s26, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 40 +; VI-NEXT: s_lshr_b32 s5, s76, 24 +; VI-NEXT: v_writelane_b32 v22, s5, 41 +; VI-NEXT: s_lshr_b32 s5, s76, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 42 +; VI-NEXT: s_lshr_b32 s5, s76, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 43 +; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 44 +; VI-NEXT: s_lshr_b32 s5, s4, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 45 +; VI-NEXT: s_lshr_b32 s5, s74, 24 +; VI-NEXT: v_writelane_b32 v22, s5, 46 +; VI-NEXT: s_lshr_b32 s5, s74, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 47 +; VI-NEXT: s_lshr_b32 s5, s74, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 48 +; VI-NEXT: s_lshr_b32 s5, s6, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 49 +; VI-NEXT: s_lshr_b32 s5, s6, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 50 +; VI-NEXT: s_lshr_b32 s5, s72, 24 +; VI-NEXT: v_writelane_b32 v22, s5, 51 +; VI-NEXT: s_lshr_b32 s5, s72, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 52 +; VI-NEXT: s_lshr_b32 s5, s72, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 53 +; VI-NEXT: s_lshr_b32 s5, s8, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 54 +; VI-NEXT: s_lshr_b32 s5, s8, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 55 +; VI-NEXT: s_lshr_b32 s5, s62, 24 +; VI-NEXT: v_writelane_b32 v22, s5, 56 +; VI-NEXT: s_lshr_b32 s5, s62, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 57 +; VI-NEXT: s_lshr_b32 s5, s62, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 58 +; VI-NEXT: s_lshr_b32 s5, s10, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 59 +; VI-NEXT: s_lshr_b32 s5, s10, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 60 +; VI-NEXT: s_lshr_b32 s5, s60, 24 +; VI-NEXT: v_writelane_b32 v22, s5, 61 +; VI-NEXT: s_lshr_b32 s5, s60, 16 +; VI-NEXT: v_writelane_b32 v22, s5, 62 +; VI-NEXT: s_lshr_b32 s5, s60, 8 +; VI-NEXT: v_writelane_b32 v22, s5, 63 +; VI-NEXT: s_lshr_b32 s5, s12, 16 +; VI-NEXT: v_writelane_b32 v21, s5, 0 +; VI-NEXT: s_lshr_b32 s5, s12, 8 ; VI-NEXT: s_mov_b32 s45, s78 ; VI-NEXT: s_mov_b32 s43, s88 -; VI-NEXT: s_mov_b32 s29, s90 -; VI-NEXT: s_mov_b32 s27, s30 -; VI-NEXT: s_mov_b32 s25, s36 -; VI-NEXT: s_mov_b32 s23, s48 -; VI-NEXT: s_mov_b32 s21, s52 -; VI-NEXT: s_mov_b32 s19, s64 -; VI-NEXT: v_writelane_b32 v21, s59, 8 -; VI-NEXT: s_lshr_b32 s59, s16, 16 -; VI-NEXT: v_writelane_b32 v22, vcc_hi, 3 -; VI-NEXT: s_lshr_b64 vcc, s[12:13], 24 -; VI-NEXT: s_lshr_b32 s47, s88, 24 -; VI-NEXT: s_lshr_b32 s57, s88, 16 -; VI-NEXT: s_lshr_b32 s61, s88, 8 -; VI-NEXT: s_lshr_b32 s75, s42, 16 -; VI-NEXT: s_lshr_b32 s79, s42, 8 -; VI-NEXT: s_lshr_b32 s89, s78, 24 -; VI-NEXT: s_lshr_b32 s91, s78, 16 -; VI-NEXT: s_lshr_b32 s31, s78, 8 -; VI-NEXT: s_lshr_b32 s37, s44, 16 -; VI-NEXT: s_lshr_b32 s49, s44, 8 -; VI-NEXT: v_writelane_b32 v21, s59, 9 -; VI-NEXT: s_lshr_b32 s59, s16, 8 -; VI-NEXT: s_lshr_b32 s63, s46, 24 -; VI-NEXT: s_lshr_b32 s73, s46, 16 -; VI-NEXT: s_lshr_b32 s77, s46, 8 -; VI-NEXT: s_lshr_b32 s53, s40, 16 -; VI-NEXT: s_lshr_b32 s65, s40, 8 -; VI-NEXT: s_lshr_b64 s[80:81], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[82:83], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[86:87], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[50:51], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[54:55], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[66:67], s[44:45], 24 -; VI-NEXT: v_writelane_b32 v22, vcc_lo, 0 -; VI-NEXT: s_lshr_b64 s[68:69], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[70:71], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[84:85], s[40:41], 24 -; VI-NEXT: v_writelane_b32 v22, vcc_hi, 1 +; VI-NEXT: s_mov_b32 s29, s34 +; VI-NEXT: s_mov_b32 s27, s50 +; VI-NEXT: s_mov_b32 s25, s64 +; VI-NEXT: v_writelane_b32 v21, s5, 1 +; VI-NEXT: s_lshr_b32 s5, s58, 24 +; VI-NEXT: s_lshr_b64 s[86:87], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 +; VI-NEXT: v_writelane_b32 v21, s5, 2 +; VI-NEXT: s_lshr_b32 s5, s58, 16 +; VI-NEXT: s_lshr_b32 s31, s90, 24 +; VI-NEXT: s_lshr_b32 s37, s90, 16 +; VI-NEXT: s_lshr_b32 s39, s90, 8 +; VI-NEXT: s_lshr_b32 s49, s18, 16 +; VI-NEXT: s_lshr_b32 s87, s18, 8 +; VI-NEXT: s_lshr_b32 s47, s26, 8 +; VI-NEXT: s_lshr_b32 s67, s34, 24 +; VI-NEXT: s_lshr_b32 s57, s34, 16 +; VI-NEXT: s_lshr_b32 s69, s34, 8 +; VI-NEXT: s_lshr_b32 s63, s28, 16 +; VI-NEXT: s_lshr_b32 s75, s28, 8 +; VI-NEXT: s_lshr_b32 s53, s88, 24 +; VI-NEXT: s_lshr_b32 s71, s88, 16 +; VI-NEXT: s_lshr_b32 s77, s88, 8 +; VI-NEXT: s_lshr_b32 s91, s42, 16 +; VI-NEXT: s_lshr_b32 s83, s42, 8 +; VI-NEXT: s_lshr_b32 s54, s78, 24 +; VI-NEXT: s_lshr_b32 s51, s78, 16 +; VI-NEXT: s_lshr_b32 s65, s78, 8 +; VI-NEXT: s_lshr_b32 s81, s44, 16 +; VI-NEXT: s_lshr_b32 s85, s44, 8 +; VI-NEXT: v_writelane_b32 v21, s5, 3 +; VI-NEXT: s_lshr_b32 s5, s58, 8 +; VI-NEXT: s_lshr_b32 s66, s14, 16 +; VI-NEXT: s_lshr_b32 s68, s14, 8 +; VI-NEXT: s_lshr_b32 s59, s56, 24 +; VI-NEXT: s_lshr_b32 s61, s56, 16 +; VI-NEXT: s_lshr_b32 s73, s56, 8 +; VI-NEXT: s_lshr_b32 s52, s16, 16 +; VI-NEXT: s_lshr_b32 s70, s16, 8 +; VI-NEXT: s_lshr_b32 s79, s46, 24 +; VI-NEXT: s_lshr_b32 s89, s46, 16 +; VI-NEXT: s_lshr_b32 s82, s46, 8 +; VI-NEXT: s_lshr_b32 s35, s40, 16 +; VI-NEXT: s_lshr_b32 s55, s40, 8 +; VI-NEXT: v_writelane_b32 v21, s5, 4 ; VI-NEXT: .LBB91_3: ; %end ; VI-NEXT: s_and_b32 s5, s44, 0xff -; VI-NEXT: s_lshl_b32 s7, s49, 8 +; VI-NEXT: s_lshl_b32 s7, s85, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_lshl_b32 s7, s66, 8 -; VI-NEXT: s_and_b32 s9, s37, 0xff -; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: s_and_b32 s7, s81, 0xff +; VI-NEXT: s_lshl_b32 s9, s48, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_and_b32 s5, s78, 0xff -; VI-NEXT: s_lshl_b32 s7, s31, 8 +; VI-NEXT: s_lshl_b32 s7, s65, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s91, 0xff -; VI-NEXT: s_lshl_b32 s9, s89, 8 +; VI-NEXT: s_and_b32 s7, s51, 0xff +; VI-NEXT: s_lshl_b32 s9, s54, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s42, 0xff -; VI-NEXT: s_lshl_b32 s7, s79, 8 +; VI-NEXT: s_lshl_b32 s7, s83, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_lshl_b32 s7, s54, 8 -; VI-NEXT: s_and_b32 s9, s75, 0xff -; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: s_and_b32 s7, s91, 0xff +; VI-NEXT: s_lshl_b32 s9, s38, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_and_b32 s5, s88, 0xff -; VI-NEXT: s_lshl_b32 s7, s61, 8 +; VI-NEXT: s_lshl_b32 s7, s77, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s57, 0xff -; VI-NEXT: s_lshl_b32 s9, s47, 8 +; VI-NEXT: s_and_b32 s7, s71, 0xff +; VI-NEXT: s_lshl_b32 s9, s53, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 39 ; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: s_and_b32 s5, s28, 0xff -; VI-NEXT: s_lshl_b32 s7, s7, 8 -; VI-NEXT: v_readlane_b32 s9, v22, 38 +; VI-NEXT: s_lshl_b32 s7, s75, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_lshl_b32 s7, s50, 8 -; VI-NEXT: s_and_b32 s9, s9, 0xff -; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: s_and_b32 s7, s63, 0xff +; VI-NEXT: s_lshl_b32 s9, s36, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 37 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: s_and_b32 s5, s90, 0xff -; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_and_b32 s5, s34, 0xff +; VI-NEXT: s_lshl_b32 s7, s69, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 36 -; VI-NEXT: v_readlane_b32 s9, v22, 35 -; VI-NEXT: s_and_b32 s7, s7, 0xff -; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_and_b32 s7, s57, 0xff +; VI-NEXT: s_lshl_b32 s9, s67, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 34 ; VI-NEXT: v_mov_b32_e32 v6, s5 ; VI-NEXT: s_and_b32 s5, s26, 0xff -; VI-NEXT: s_lshl_b32 s7, s7, 8 -; VI-NEXT: v_readlane_b32 s9, v22, 33 +; VI-NEXT: s_lshl_b32 s7, s47, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_lshl_b32 s7, s38, 8 -; VI-NEXT: s_and_b32 s9, s9, 0xff -; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 40 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s30, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 32 +; VI-NEXT: v_readlane_b32 s7, v22, 39 ; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: s_and_b32 s5, s30, 0xff +; VI-NEXT: s_and_b32 s5, s50, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 31 -; VI-NEXT: v_readlane_b32 s9, v22, 30 +; VI-NEXT: v_readlane_b32 s7, v22, 38 +; VI-NEXT: v_readlane_b32 s9, v22, 37 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s9, s9, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 29 +; VI-NEXT: v_readlane_b32 s7, v22, 36 ; VI-NEXT: v_mov_b32_e32 v8, s5 ; VI-NEXT: s_and_b32 s5, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 -; VI-NEXT: v_readlane_b32 s9, v22, 28 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_lshl_b32 s7, s34, 8 -; VI-NEXT: s_and_b32 s9, s9, 0xff -; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 35 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s86, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 27 +; VI-NEXT: v_readlane_b32 s7, v22, 34 ; VI-NEXT: v_mov_b32_e32 v9, s5 -; VI-NEXT: s_and_b32 s5, s36, 0xff +; VI-NEXT: s_and_b32 s5, s64, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 26 -; VI-NEXT: v_readlane_b32 s9, v22, 25 +; VI-NEXT: v_readlane_b32 s7, v22, 33 +; VI-NEXT: v_readlane_b32 s9, v22, 32 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s9, s9, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 24 +; VI-NEXT: v_readlane_b32 s7, v22, 31 ; VI-NEXT: v_mov_b32_e32 v10, s5 ; VI-NEXT: s_and_b32 s5, s22, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 -; VI-NEXT: v_readlane_b32 s9, v22, 23 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_lshl_b32 s7, s86, 8 -; VI-NEXT: s_and_b32 s9, s9, 0xff -; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 30 +; VI-NEXT: v_readlane_b32 s22, v22, 0 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s22, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 22 +; VI-NEXT: v_readlane_b32 s7, v22, 29 ; VI-NEXT: v_mov_b32_e32 v11, s5 -; VI-NEXT: s_and_b32 s5, s48, 0xff +; VI-NEXT: s_and_b32 s5, s80, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 21 -; VI-NEXT: v_readlane_b32 s9, v22, 20 +; VI-NEXT: v_readlane_b32 s7, v22, 28 +; VI-NEXT: v_readlane_b32 s9, v22, 27 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s9, s9, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 19 +; VI-NEXT: v_readlane_b32 s7, v22, 26 ; VI-NEXT: v_mov_b32_e32 v12, s5 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 -; VI-NEXT: v_readlane_b32 s9, v22, 18 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_lshl_b32 s7, s82, 8 -; VI-NEXT: s_and_b32 s9, s9, 0xff -; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: v_readlane_b32 s7, v22, 25 +; VI-NEXT: v_readlane_b32 s20, v22, 2 +; VI-NEXT: s_and_b32 s7, s7, 0xff +; VI-NEXT: s_lshl_b32 s9, s20, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 17 +; VI-NEXT: v_readlane_b32 s7, v22, 24 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 ; VI-NEXT: v_mov_b32_e32 v13, s5 -; VI-NEXT: s_and_b32 s5, s52, 0xff +; VI-NEXT: s_and_b32 s5, s84, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 16 -; VI-NEXT: v_readlane_b32 s9, v22, 15 +; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: v_readlane_b32 s7, v22, 23 +; VI-NEXT: v_readlane_b32 s9, v22, 22 +; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 ; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 14 -; VI-NEXT: v_mov_b32_e32 v14, s5 +; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s7, s7, 8 -; VI-NEXT: v_readlane_b32 s9, v22, 13 +; VI-NEXT: s_lshl_b32 s7, s87, 8 +; VI-NEXT: v_readlane_b32 s18, v22, 4 +; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_lshl_b32 s7, s80, 8 -; VI-NEXT: s_and_b32 s9, s9, 0xff -; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: s_and_b32 s7, s49, 0xff +; VI-NEXT: s_lshl_b32 s9, s18, 8 +; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 12 -; VI-NEXT: v_mov_b32_e32 v15, s5 -; VI-NEXT: s_and_b32 s5, s64, 0xff -; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s90, 0xff +; VI-NEXT: s_lshl_b32 s7, s39, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 11 -; VI-NEXT: v_readlane_b32 s9, v22, 10 -; VI-NEXT: s_and_b32 s7, s7, 0xff -; VI-NEXT: s_lshl_b32 s9, s9, 8 +; VI-NEXT: s_and_b32 s7, s37, 0xff +; VI-NEXT: s_lshl_b32 s9, s31, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_mov_b32_e32 v16, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s40, 0xff -; VI-NEXT: s_lshl_b32 s7, s65, 8 +; VI-NEXT: s_lshl_b32 s7, s55, 8 +; VI-NEXT: v_readlane_b32 s18, v22, 6 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_lshl_b32 s7, s84, 8 -; VI-NEXT: s_and_b32 s9, s53, 0xff -; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: s_and_b32 s7, s35, 0xff +; VI-NEXT: s_lshl_b32 s9, s18, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: v_mov_b32_e32 v17, s5 -; VI-NEXT: s_and_b32 s5, s46, 0xff -; VI-NEXT: s_lshl_b32 s7, s77, 8 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_and_b32 s5, s46, 0xff +; VI-NEXT: s_lshl_b32 s7, s82, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_and_b32 s7, s73, 0xff -; VI-NEXT: s_lshl_b32 s9, s63, 8 -; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_and_b32 s7, s89, 0xff +; VI-NEXT: s_lshl_b32 s9, s79, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: v_mov_b32_e32 v18, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s16, 0xff -; VI-NEXT: s_lshl_b32 s7, s59, 8 -; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; VI-NEXT: s_lshl_b32 s7, s70, 8 +; VI-NEXT: v_readlane_b32 s16, v22, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: v_readlane_b32 s7, v21, 9 -; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: s_and_b32 s7, s7, 0xff -; VI-NEXT: s_lshl_b32 s9, s70, 8 -; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_and_b32 s7, s52, 0xff +; VI-NEXT: s_lshl_b32 s9, s16, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v21, 8 -; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s56, 0xff -; VI-NEXT: s_lshl_b32 s7, s7, 8 -; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_lshl_b32 s7, s73, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v21, 7 -; VI-NEXT: v_readlane_b32 s9, v21, 6 -; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 -; VI-NEXT: s_and_b32 s7, s7, 0xff -; VI-NEXT: s_lshl_b32 s9, s9, 8 -; VI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v0 +; VI-NEXT: s_and_b32 s7, s61, 0xff +; VI-NEXT: s_lshl_b32 s9, s59, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 -; VI-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x48, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v21, 5 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s14, 0xff -; VI-NEXT: s_lshl_b32 s7, s7, 8 +; VI-NEXT: s_lshl_b32 s7, s68, 8 +; VI-NEXT: v_readlane_b32 s14, v22, 10 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v21, 4 -; VI-NEXT: s_and_b32 s7, s7, 0xff -; VI-NEXT: s_lshl_b32 s9, s68, 8 +; VI-NEXT: s_and_b32 s7, s66, 0xff +; VI-NEXT: s_lshl_b32 s9, s14, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x4c, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v21, 3 +; VI-NEXT: v_readlane_b32 s7, v21, 4 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s58, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v21, 2 -; VI-NEXT: v_readlane_b32 s9, v21, 1 +; VI-NEXT: v_readlane_b32 s7, v21, 3 +; VI-NEXT: v_readlane_b32 s9, v21, 2 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s9, s9, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 @@ -170829,14 +170167,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v21, 0 +; VI-NEXT: v_readlane_b32 s7, v21, 1 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s12, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 63 -; VI-NEXT: v_readlane_b32 s12, v22, 0 +; VI-NEXT: v_readlane_b32 s7, v21, 0 +; VI-NEXT: v_readlane_b32 s12, v22, 12 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s9, s12, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 @@ -170844,14 +170182,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 62 +; VI-NEXT: v_readlane_b32 s7, v22, 63 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s60, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 61 -; VI-NEXT: v_readlane_b32 s9, v22, 60 +; VI-NEXT: v_readlane_b32 s7, v22, 62 +; VI-NEXT: v_readlane_b32 s9, v22, 61 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s9, s9, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 @@ -170859,14 +170197,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 59 +; VI-NEXT: v_readlane_b32 s7, v22, 60 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s10, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 58 -; VI-NEXT: v_readlane_b32 s10, v22, 2 +; VI-NEXT: v_readlane_b32 s7, v22, 59 +; VI-NEXT: v_readlane_b32 s10, v22, 14 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s9, s10, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 @@ -170874,14 +170212,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 57 +; VI-NEXT: v_readlane_b32 s7, v22, 58 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s62, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 56 -; VI-NEXT: v_readlane_b32 s9, v22, 55 +; VI-NEXT: v_readlane_b32 s7, v22, 57 +; VI-NEXT: v_readlane_b32 s9, v22, 56 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s9, s9, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 @@ -170889,14 +170227,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 54 +; VI-NEXT: v_readlane_b32 s7, v22, 55 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s8, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 53 -; VI-NEXT: v_readlane_b32 s8, v22, 4 +; VI-NEXT: v_readlane_b32 s7, v22, 54 +; VI-NEXT: v_readlane_b32 s8, v22, 16 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s8, s8, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 @@ -170904,14 +170242,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v0 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 52 +; VI-NEXT: v_readlane_b32 s7, v22, 53 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s72, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: v_readlane_b32 s7, v22, 51 -; VI-NEXT: v_readlane_b32 s8, v22, 50 +; VI-NEXT: v_readlane_b32 s7, v22, 52 +; VI-NEXT: v_readlane_b32 s8, v22, 51 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s8, s8, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 @@ -170922,11 +170260,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s6, 0xff -; VI-NEXT: v_readlane_b32 s6, v22, 49 +; VI-NEXT: v_readlane_b32 s6, v22, 50 ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_readlane_b32 s6, v22, 48 -; VI-NEXT: v_readlane_b32 s8, v22, 6 +; VI-NEXT: v_readlane_b32 s6, v22, 49 +; VI-NEXT: v_readlane_b32 s8, v22, 18 ; VI-NEXT: s_and_b32 s6, s6, 0xff ; VI-NEXT: s_lshl_b32 s7, s8, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 @@ -170934,14 +170272,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_readlane_b32 s6, v22, 47 +; VI-NEXT: v_readlane_b32 s6, v22, 48 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s74, 0xff ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_readlane_b32 s6, v22, 46 -; VI-NEXT: v_readlane_b32 s7, v22, 45 +; VI-NEXT: v_readlane_b32 s6, v22, 47 +; VI-NEXT: v_readlane_b32 s7, v22, 46 ; VI-NEXT: s_and_b32 s6, s6, 0xff ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 @@ -170951,12 +170289,12 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_readlane_b32 s5, v22, 44 +; VI-NEXT: v_readlane_b32 s5, v22, 45 ; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_lshl_b32 s5, s5, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_readlane_b32 s5, v22, 43 -; VI-NEXT: v_readlane_b32 s6, v22, 8 +; VI-NEXT: v_readlane_b32 s5, v22, 44 +; VI-NEXT: v_readlane_b32 s6, v22, 20 ; VI-NEXT: s_and_b32 s5, s5, 0xff ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -170964,14 +170302,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v0 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_readlane_b32 s5, v22, 42 +; VI-NEXT: v_readlane_b32 s5, v22, 43 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_and_b32 s4, s76, 0xff ; VI-NEXT: s_lshl_b32 s5, s5, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_readlane_b32 s5, v22, 41 -; VI-NEXT: v_readlane_b32 s6, v22, 40 +; VI-NEXT: v_readlane_b32 s5, v22, 42 +; VI-NEXT: v_readlane_b32 s6, v22, 41 ; VI-NEXT: s_and_b32 s5, s5, 0xff ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -170979,14 +170317,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v0 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_readlane_b32 s9, v22, 5 +; VI-NEXT: v_readlane_b32 s19, v22, 5 +; VI-NEXT: v_readlane_b32 s9, v22, 17 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_readlane_b32 s13, v22, 1 -; VI-NEXT: v_readlane_b32 s11, v22, 3 -; VI-NEXT: v_readlane_b32 s9, v22, 7 -; VI-NEXT: v_readlane_b32 s7, v22, 9 +; VI-NEXT: v_readlane_b32 s23, v22, 1 +; VI-NEXT: v_readlane_b32 s21, v22, 3 +; VI-NEXT: v_readlane_b32 s19, v22, 7 +; VI-NEXT: v_readlane_b32 s17, v22, 9 +; VI-NEXT: v_readlane_b32 s15, v22, 11 +; VI-NEXT: v_readlane_b32 s13, v22, 13 +; VI-NEXT: v_readlane_b32 s11, v22, 15 +; VI-NEXT: v_readlane_b32 s9, v22, 19 +; VI-NEXT: v_readlane_b32 s7, v22, 21 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_readlane_b32 s87, v20, 31 ; VI-NEXT: v_readlane_b32 s86, v20, 30 @@ -171030,93 +170374,125 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: .LBB91_4: ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: v_writelane_b32 v22, s60, 0 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr61 -; VI-NEXT: v_writelane_b32 v22, s61, 1 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: v_writelane_b32 v22, s62, 2 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr63 -; VI-NEXT: v_writelane_b32 v22, s63, 3 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; kill: killed $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: v_writelane_b32 v22, s72, 4 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr73 -; VI-NEXT: v_writelane_b32 v22, s73, 5 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; kill: killed $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: v_writelane_b32 v22, s74, 6 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr75 -; VI-NEXT: v_writelane_b32 v22, s75, 7 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; kill: killed $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: v_writelane_b32 v22, s76, 8 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr49 -; VI-NEXT: ; implicit-def: $sgpr37 -; VI-NEXT: ; implicit-def: $sgpr31 -; VI-NEXT: ; implicit-def: $sgpr91 -; VI-NEXT: ; implicit-def: $sgpr89 -; VI-NEXT: ; implicit-def: $sgpr79 -; VI-NEXT: ; implicit-def: $sgpr57 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; kill: killed $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: ; kill: killed $vcc_lo +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr48 ; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr51 ; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr91 ; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr30 -; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr53 ; VI-NEXT: ; implicit-def: $sgpr36 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr82 -; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr80 ; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr80 ; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr79 ; VI-NEXT: ; implicit-def: $sgpr70 -; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr68 -; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; kill: killed $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; kill: killed $sgpr72 ; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: v_writelane_b32 v22, s77, 9 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; kill: killed $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; kill: killed $sgpr76 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: ; kill: killed $vcc_lo ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; kill: killed $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; kill: killed $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; kill: killed $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; kill: killed $sgpr76 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: ; kill: killed $vcc_lo ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; kill: killed $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; kill: killed $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; kill: killed $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; kill: killed $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; kill: killed $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; kill: killed $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; kill: killed $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; kill: killed $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; kill: killed $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr46 @@ -171128,71 +170504,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s46, 0 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s47, 1 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr46 @@ -171202,18 +170517,42 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s46, 2 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s47, 3 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s46, 4 +; VI-NEXT: v_writelane_b32 v22, s47, 5 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: v_writelane_b32 v22, s46, 6 +; VI-NEXT: v_writelane_b32 v22, s47, 7 +; VI-NEXT: v_writelane_b32 v22, s56, 8 +; VI-NEXT: v_writelane_b32 v22, s57, 9 +; VI-NEXT: v_writelane_b32 v22, s58, 10 +; VI-NEXT: v_writelane_b32 v22, s59, 11 +; VI-NEXT: v_writelane_b32 v22, s60, 12 +; VI-NEXT: v_writelane_b32 v22, s61, 13 +; VI-NEXT: v_writelane_b32 v22, s62, 14 +; VI-NEXT: v_writelane_b32 v22, s63, 15 +; VI-NEXT: v_writelane_b32 v22, s72, 16 +; VI-NEXT: v_writelane_b32 v22, s73, 17 +; VI-NEXT: v_writelane_b32 v22, s74, 18 +; VI-NEXT: v_writelane_b32 v22, s75, 19 +; VI-NEXT: v_writelane_b32 v22, s76, 20 ; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: v_writelane_b32 v22, s77, 21 +; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: s_branch .LBB91_2 ; ; GFX9-LABEL: bitcast_v64bf16_to_v128i8_scalar: @@ -171311,151 +170650,151 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_cbranch_scc0 .LBB91_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s46, s19, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: v_writelane_b32 v21, s46, 10 ; GFX9-NEXT: s_lshr_b32 s46, s19, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: v_writelane_b32 v21, s46, 11 ; GFX9-NEXT: s_lshr_b32 s46, s18, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 12 ; GFX9-NEXT: s_lshr_b32 s46, s18, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 ; GFX9-NEXT: s_lshr_b32 s46, s21, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 ; GFX9-NEXT: s_lshr_b32 s46, s21, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 ; GFX9-NEXT: s_lshr_b32 s46, s20, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 ; GFX9-NEXT: s_lshr_b32 s46, s20, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 ; GFX9-NEXT: s_lshr_b32 s46, s23, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 ; GFX9-NEXT: s_lshr_b32 s46, s23, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: v_writelane_b32 v21, s46, 9 ; GFX9-NEXT: s_lshr_b32 s46, s23, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 ; GFX9-NEXT: s_lshr_b32 s46, s22, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 ; GFX9-NEXT: s_lshr_b32 s46, s22, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 ; GFX9-NEXT: s_lshr_b32 s46, s25, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 ; GFX9-NEXT: s_lshr_b32 s46, s25, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 12 +; GFX9-NEXT: v_writelane_b32 v21, s46, 8 ; GFX9-NEXT: s_lshr_b32 s46, s25, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 ; GFX9-NEXT: s_lshr_b32 s46, s24, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 ; GFX9-NEXT: s_lshr_b32 s46, s24, 8 -; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 ; GFX9-NEXT: s_lshr_b32 s46, s27, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 ; GFX9-NEXT: s_lshr_b32 s46, s27, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 11 +; GFX9-NEXT: v_writelane_b32 v21, s46, 7 ; GFX9-NEXT: s_lshr_b32 s46, s27, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: s_lshr_b32 s46, s5, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s5, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s4, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s4, 8 ; GFX9-NEXT: v_writelane_b32 v21, s46, 31 -; GFX9-NEXT: s_lshr_b32 s46, s29, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 10 -; GFX9-NEXT: s_lshr_b32 s46, s43, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 9 -; GFX9-NEXT: s_lshr_b32 s46, s45, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 8 -; GFX9-NEXT: s_lshr_b32 s46, s5, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 7 -; GFX9-NEXT: s_lshr_b32 s46, s7, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 6 -; GFX9-NEXT: s_lshr_b32 s46, s9, 24 +; GFX9-NEXT: s_lshr_b32 s46, s7, 24 ; GFX9-NEXT: v_writelane_b32 v21, s46, 32 -; GFX9-NEXT: s_lshr_b32 s46, s9, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 5 -; GFX9-NEXT: s_lshr_b32 s46, s9, 8 +; GFX9-NEXT: s_lshr_b32 s46, s7, 8 ; GFX9-NEXT: v_writelane_b32 v21, s46, 33 -; GFX9-NEXT: s_lshr_b32 s46, s8, 16 +; GFX9-NEXT: s_lshr_b32 s46, s6, 16 ; GFX9-NEXT: v_writelane_b32 v21, s46, 34 -; GFX9-NEXT: s_lshr_b32 s46, s8, 8 +; GFX9-NEXT: s_lshr_b32 s46, s6, 8 ; GFX9-NEXT: v_writelane_b32 v21, s46, 35 -; GFX9-NEXT: s_lshr_b32 s46, s11, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 4 -; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: s_lshr_b32 s46, s9, 24 ; GFX9-NEXT: v_writelane_b32 v21, s46, 36 -; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: s_lshr_b32 s46, s9, 8 ; GFX9-NEXT: v_writelane_b32 v21, s46, 37 -; GFX9-NEXT: s_lshr_b32 s46, s10, 8 +; GFX9-NEXT: s_lshr_b32 s46, s8, 16 ; GFX9-NEXT: v_writelane_b32 v21, s46, 38 -; GFX9-NEXT: s_lshr_b32 s46, s13, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 3 -; GFX9-NEXT: s_lshr_b32 s46, s13, 8 +; GFX9-NEXT: s_lshr_b32 s46, s8, 8 ; GFX9-NEXT: v_writelane_b32 v21, s46, 39 -; GFX9-NEXT: s_lshr_b32 s46, s15, 16 -; GFX9-NEXT: s_lshr_b32 s60, s4, 8 -; GFX9-NEXT: s_lshr_b32 s61, s7, 24 -; GFX9-NEXT: s_lshr_b32 s62, s7, 8 -; GFX9-NEXT: s_lshr_b32 s63, s6, 16 -; GFX9-NEXT: v_writelane_b32 v21, s46, 2 -; GFX9-NEXT: s_lshr_b32 s46, s17, 16 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[28:29], 24 +; GFX9-NEXT: s_lshr_b32 s46, s11, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s10, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s10, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[18:19], 24 +; GFX9-NEXT: v_writelane_b32 v21, s62, 2 +; GFX9-NEXT: v_writelane_b32 v21, s63, 3 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 +; GFX9-NEXT: v_writelane_b32 v21, s62, 0 +; GFX9-NEXT: s_lshr_b32 s73, s9, 16 +; GFX9-NEXT: s_lshr_b32 s72, s11, 16 +; GFX9-NEXT: s_lshr_b32 s61, s13, 16 +; GFX9-NEXT: s_lshr_b32 s60, s17, 16 +; GFX9-NEXT: v_writelane_b32 v21, s63, 1 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 +; GFX9-NEXT: s_lshr_b32 s79, s7, 16 +; GFX9-NEXT: s_lshr_b32 s78, s15, 16 +; GFX9-NEXT: s_mov_b32 s63, s60 +; GFX9-NEXT: s_mov_b32 s75, s61 +; GFX9-NEXT: s_mov_b32 s77, s72 +; GFX9-NEXT: s_mov_b32 s93, s73 ; GFX9-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 -; GFX9-NEXT: s_lshr_b32 s47, s5, 24 -; GFX9-NEXT: s_lshr_b32 s56, s5, 8 -; GFX9-NEXT: s_lshr_b32 s57, s4, 16 -; GFX9-NEXT: s_lshr_b32 s74, s6, 8 -; GFX9-NEXT: s_lshr_b32 s75, s11, 24 -; GFX9-NEXT: s_lshr_b32 s78, s13, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 1 -; GFX9-NEXT: s_lshr_b32 s46, s41, 16 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[22:23], 24 -; GFX9-NEXT: s_mov_b32 s91, s60 -; GFX9-NEXT: s_mov_b32 s95, s61 -; GFX9-NEXT: s_mov_b32 s31, s62 -; GFX9-NEXT: s_mov_b32 s35, s63 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[8:9], 24 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 -; GFX9-NEXT: s_lshr_b32 s69, s19, 16 -; GFX9-NEXT: s_lshr_b32 s68, s21, 16 -; GFX9-NEXT: s_lshr_b32 s38, s26, 16 -; GFX9-NEXT: s_lshr_b32 s48, s26, 8 -; GFX9-NEXT: s_lshr_b32 s50, s29, 24 -; GFX9-NEXT: s_lshr_b32 s51, s29, 8 -; GFX9-NEXT: s_lshr_b32 s53, s28, 16 -; GFX9-NEXT: s_lshr_b32 s71, s28, 8 -; GFX9-NEXT: s_lshr_b32 s80, s43, 24 -; GFX9-NEXT: s_lshr_b32 s82, s43, 8 -; GFX9-NEXT: s_lshr_b32 s65, s42, 16 -; GFX9-NEXT: s_lshr_b32 s85, s42, 8 -; GFX9-NEXT: s_lshr_b32 s66, s45, 24 -; GFX9-NEXT: s_lshr_b32 s97, s45, 8 -; GFX9-NEXT: s_lshr_b32 s99, s44, 16 -; GFX9-NEXT: s_lshr_b32 s67, s44, 8 -; GFX9-NEXT: s_lshr_b32 s54, s12, 16 -; GFX9-NEXT: s_lshr_b32 s39, s12, 8 -; GFX9-NEXT: s_lshr_b32 s49, s15, 24 -; GFX9-NEXT: s_lshr_b32 s55, s15, 8 -; GFX9-NEXT: s_lshr_b32 s52, s14, 16 -; GFX9-NEXT: s_lshr_b32 s70, s14, 8 -; GFX9-NEXT: s_lshr_b32 s64, s17, 24 -; GFX9-NEXT: s_lshr_b32 s81, s17, 8 -; GFX9-NEXT: s_lshr_b32 s83, s16, 16 -; GFX9-NEXT: s_lshr_b32 s84, s16, 8 -; GFX9-NEXT: s_lshr_b32 s86, s41, 24 -; GFX9-NEXT: v_writelane_b32 v21, s46, 0 -; GFX9-NEXT: s_lshr_b32 s87, s41, 8 -; GFX9-NEXT: s_lshr_b32 s96, s40, 16 -; GFX9-NEXT: s_lshr_b32 s98, s40, 8 -; GFX9-NEXT: s_mov_b32 s59, s47 -; GFX9-NEXT: s_mov_b32 s73, s56 -; GFX9-NEXT: s_mov_b32 s77, s57 -; GFX9-NEXT: s_mov_b32 s37, s74 -; GFX9-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 -; GFX9-NEXT: s_mov_b32 s61, s75 -; GFX9-NEXT: s_mov_b32 s63, s78 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[8:9], 24 +; GFX9-NEXT: s_lshr_b32 s88, s5, 16 +; GFX9-NEXT: s_lshr_b32 s89, s11, 24 +; GFX9-NEXT: s_mov_b32 s35, s79 +; GFX9-NEXT: s_lshr_b64 s[38:39], s[44:45], 24 +; GFX9-NEXT: v_writelane_b32 v21, s60, 4 +; GFX9-NEXT: s_mov_b32 s73, s78 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[10:11], 24 +; GFX9-NEXT: s_lshr_b32 s85, s19, 16 +; GFX9-NEXT: s_lshr_b32 s84, s21, 16 +; GFX9-NEXT: s_lshr_b32 s55, s26, 16 +; GFX9-NEXT: s_lshr_b32 s64, s26, 8 +; GFX9-NEXT: s_lshr_b32 s71, s29, 24 +; GFX9-NEXT: s_lshr_b32 s70, s29, 16 +; GFX9-NEXT: s_lshr_b32 s68, s29, 8 +; GFX9-NEXT: s_lshr_b32 s86, s28, 16 +; GFX9-NEXT: s_lshr_b32 s87, s28, 8 +; GFX9-NEXT: s_lshr_b32 s97, s43, 24 +; GFX9-NEXT: s_lshr_b32 s54, s43, 16 +; GFX9-NEXT: s_lshr_b32 s48, s43, 8 +; GFX9-NEXT: s_lshr_b32 s53, s42, 16 +; GFX9-NEXT: s_lshr_b32 s47, s42, 8 +; GFX9-NEXT: s_lshr_b32 s99, s45, 24 +; GFX9-NEXT: s_lshr_b32 s51, s45, 16 +; GFX9-NEXT: s_lshr_b32 s50, s45, 8 +; GFX9-NEXT: s_lshr_b32 s83, s44, 16 +; GFX9-NEXT: s_lshr_b32 s57, s44, 8 +; GFX9-NEXT: s_lshr_b32 s65, s13, 24 +; GFX9-NEXT: s_lshr_b32 s66, s13, 8 +; GFX9-NEXT: s_lshr_b32 s67, s12, 16 +; GFX9-NEXT: s_lshr_b32 s69, s12, 8 +; GFX9-NEXT: s_lshr_b32 s80, s15, 24 +; GFX9-NEXT: s_lshr_b32 s96, s15, 8 +; GFX9-NEXT: s_lshr_b32 s98, s14, 16 +; GFX9-NEXT: s_lshr_b32 s81, s14, 8 +; GFX9-NEXT: s_lshr_b32 s46, s17, 24 +; GFX9-NEXT: s_lshr_b32 s49, s17, 8 +; GFX9-NEXT: s_lshr_b32 s52, s16, 16 +; GFX9-NEXT: s_lshr_b32 s82, s16, 8 +; GFX9-NEXT: s_lshr_b32 s56, s41, 24 +; GFX9-NEXT: s_lshr_b32 s36, s41, 16 +; GFX9-NEXT: s_lshr_b32 s37, s41, 8 +; GFX9-NEXT: s_lshr_b32 s58, s40, 16 +; GFX9-NEXT: s_lshr_b32 s59, s40, 8 +; GFX9-NEXT: s_mov_b32 s39, s88 +; GFX9-NEXT: v_writelane_b32 v21, s61, 5 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[6:7], 24 +; GFX9-NEXT: s_mov_b32 s79, s89 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[12:13], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[14:15], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[16:17], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB91_3 ; GFX9-NEXT: .LBB91_2: ; %cmp.true ; GFX9-NEXT: s_and_b32 s46, s41, 0xffff0000 @@ -171471,8 +170810,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s41, s41, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s41, v1 +; GFX9-NEXT: s_lshr_b32 s46, s46, 16 ; GFX9-NEXT: v_readfirstlane_b32 s41, v2 -; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 6 ; GFX9-NEXT: s_bfe_u32 s46, s41, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s41 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff @@ -171484,10 +170824,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 ; GFX9-NEXT: v_readfirstlane_b32 s46, v2 ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 -; GFX9-NEXT: s_lshr_b32 s41, s41, 16 ; GFX9-NEXT: s_add_i32 s47, s47, s46 -; GFX9-NEXT: v_writelane_b32 v21, s57, 0 -; GFX9-NEXT: s_pack_ll_b32_b16 s89, s41, s57 +; GFX9-NEXT: s_lshr_b32 s41, s41, 16 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff ; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -171510,7 +170848,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s40, s40, 16 ; GFX9-NEXT: s_add_i32 s47, s47, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s88, s40, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s36, s40, s56 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff ; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -171519,7 +170857,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s17, v1 ; GFX9-NEXT: v_readfirstlane_b32 s17, v2 -; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_lshr_b32 s61, s46, 16 ; GFX9-NEXT: s_bfe_u32 s46, s17, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s17 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff @@ -171531,10 +170869,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 ; GFX9-NEXT: v_readfirstlane_b32 s46, v2 ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 -; GFX9-NEXT: s_lshr_b32 s17, s17, 16 ; GFX9-NEXT: s_add_i32 s47, s47, s46 -; GFX9-NEXT: v_writelane_b32 v21, s57, 1 -; GFX9-NEXT: s_pack_ll_b32_b16 s93, s17, s57 +; GFX9-NEXT: s_lshr_b32 s17, s17, 16 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff ; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -171557,7 +170893,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s16, s16, 16 ; GFX9-NEXT: s_add_i32 s47, s47, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s92, s16, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s30, s16, s56 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff ; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -171566,7 +170902,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_lshl_b32 s15, s15, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s15, v1 ; GFX9-NEXT: v_readfirstlane_b32 s15, v2 -; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_lshr_b32 s89, s46, 16 ; GFX9-NEXT: s_bfe_u32 s46, s15, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s15 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff @@ -171578,10 +170914,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 ; GFX9-NEXT: v_readfirstlane_b32 s46, v2 ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 -; GFX9-NEXT: s_lshr_b32 s15, s15, 16 ; GFX9-NEXT: s_add_i32 s47, s47, s46 -; GFX9-NEXT: v_writelane_b32 v21, s57, 2 -; GFX9-NEXT: s_pack_ll_b32_b16 s79, s15, s57 +; GFX9-NEXT: s_lshr_b32 s15, s15, 16 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff ; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -171604,7 +170938,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s14, s14, 16 ; GFX9-NEXT: s_add_i32 s47, s47, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s78, s14, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s94, s14, s56 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff ; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -171613,7 +170947,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_lshl_b32 s13, s13, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s13, v1 ; GFX9-NEXT: v_readfirstlane_b32 s13, v2 -; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_lshr_b32 s73, s46, 16 ; GFX9-NEXT: s_bfe_u32 s46, s13, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s13 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff @@ -171625,10 +170959,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 ; GFX9-NEXT: v_readfirstlane_b32 s46, v2 ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 -; GFX9-NEXT: s_lshr_b32 s13, s13, 16 ; GFX9-NEXT: s_add_i32 s47, s47, s46 -; GFX9-NEXT: v_writelane_b32 v21, s57, 3 -; GFX9-NEXT: s_pack_ll_b32_b16 s75, s13, s57 +; GFX9-NEXT: s_lshr_b32 s13, s13, 16 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff ; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -171651,7 +170983,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s12, s12, 16 ; GFX9-NEXT: s_add_i32 s47, s47, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s74, s12, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s90, s12, s56 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff ; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -171660,7 +170992,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_lshl_b32 s11, s11, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s11, v1 ; GFX9-NEXT: v_readfirstlane_b32 s11, v2 -; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_lshr_b32 s79, s46, 16 ; GFX9-NEXT: s_bfe_u32 s46, s11, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s11 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff @@ -171672,10 +171004,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 ; GFX9-NEXT: v_readfirstlane_b32 s46, v2 ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 -; GFX9-NEXT: s_lshr_b32 s11, s11, 16 ; GFX9-NEXT: s_add_i32 s47, s47, s46 -; GFX9-NEXT: v_writelane_b32 v21, s57, 4 -; GFX9-NEXT: s_pack_ll_b32_b16 s63, s11, s57 +; GFX9-NEXT: s_lshr_b32 s11, s11, 16 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff ; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -171698,7 +171028,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s10, s10, 16 ; GFX9-NEXT: s_add_i32 s47, s47, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s62, s10, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s88, s10, s56 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff ; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -171707,7 +171037,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_lshl_b32 s9, s9, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s9, v1 ; GFX9-NEXT: v_readfirstlane_b32 s9, v2 -; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_lshr_b32 s91, s46, 16 ; GFX9-NEXT: s_bfe_u32 s46, s9, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s9 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff @@ -171719,10 +171049,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 ; GFX9-NEXT: v_readfirstlane_b32 s46, v2 ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 -; GFX9-NEXT: s_lshr_b32 s9, s9, 16 ; GFX9-NEXT: s_add_i32 s47, s47, s46 -; GFX9-NEXT: v_writelane_b32 v21, s57, 5 -; GFX9-NEXT: s_pack_ll_b32_b16 s61, s9, s57 +; GFX9-NEXT: s_lshr_b32 s9, s9, 16 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff ; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -171745,7 +171073,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s8, s8, 16 ; GFX9-NEXT: s_add_i32 s47, s47, s46 -; GFX9-NEXT: s_pack_ll_b32_b16 s60, s8, s56 +; GFX9-NEXT: s_pack_ll_b32_b16 s78, s8, s56 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff ; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 @@ -171754,7 +171082,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s7, v1 ; GFX9-NEXT: v_readfirstlane_b32 s7, v2 -; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_lshr_b32 s95, s46, 16 ; GFX9-NEXT: s_bfe_u32 s46, s7, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s7 ; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff @@ -171769,632 +171097,655 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_add_i32 s47, s47, s46 ; GFX9-NEXT: s_lshr_b32 s7, s7, 16 ; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff -; GFX9-NEXT: s_or_b32 s58, s46, 0x400000 +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec -; GFX9-NEXT: s_cselect_b32 s46, s58, s56 +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s6, v1 ; GFX9-NEXT: v_readfirstlane_b32 s6, v2 ; GFX9-NEXT: s_lshr_b32 s56, s46, 16 ; GFX9-NEXT: s_bfe_u32 s46, s6, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s6 -; GFX9-NEXT: s_add_i32 s58, s46, 0x7fff +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s6, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec -; GFX9-NEXT: s_cselect_b32 s6, s6, s58 +; GFX9-NEXT: s_cselect_b32 s6, s6, s57 ; GFX9-NEXT: s_and_b32 s46, s5, 0xffff0000 ; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 ; GFX9-NEXT: v_readfirstlane_b32 s46, v2 ; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 -; GFX9-NEXT: s_add_i32 s47, s47, s46 ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: s_add_i32 s58, s47, 0x7fff -; GFX9-NEXT: s_or_b32 s59, s46, 0x400000 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s72, s6, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec -; GFX9-NEXT: s_cselect_b32 s46, s59, s58 +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: s_lshr_b32 s59, s46, 16 +; GFX9-NEXT: s_lshr_b32 s31, s46, 16 ; GFX9-NEXT: s_bfe_u32 s46, s5, 0x10010 ; GFX9-NEXT: s_add_i32 s46, s46, s5 -; GFX9-NEXT: s_add_i32 s58, s46, 0x7fff +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s5, 22 ; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec -; GFX9-NEXT: s_cselect_b32 s5, s5, s58 +; GFX9-NEXT: s_cselect_b32 s5, s5, s56 ; GFX9-NEXT: s_and_b32 s46, s4, 0xffff0000 ; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 ; GFX9-NEXT: v_readfirstlane_b32 s46, v2 -; GFX9-NEXT: s_bfe_u32 s58, s46, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s46 -; GFX9-NEXT: v_writelane_b32 v21, s57, 6 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_add_i32 s47, s47, s46 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_writelane_b32 v21, s59, 7 -; GFX9-NEXT: s_pack_ll_b32_b16 s47, s5, s59 -; GFX9-NEXT: s_bitset1_b32 s46, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s46, s46, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v2 -; GFX9-NEXT: s_bfe_u32 s58, s4, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s4 -; GFX9-NEXT: s_lshr_b32 s46, s46, 16 -; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s4, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s4 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s4, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s4, s4, s72 -; GFX9-NEXT: s_and_b32 s58, s45, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 -; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s4, s4, s57 +; GFX9-NEXT: s_and_b32 s46, s45, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s60, s4, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s45, s45, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s45, v1 ; GFX9-NEXT: v_readfirstlane_b32 s45, v2 -; GFX9-NEXT: s_lshr_b32 s73, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s45, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s45 -; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s37, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s45, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s45 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s45, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s45, s45, s72 -; GFX9-NEXT: s_and_b32 s58, s44, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s45, s45, s56 +; GFX9-NEXT: s_and_b32 s46, s44, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_add_i32 s47, s47, s46 ; GFX9-NEXT: s_lshr_b32 s45, s45, 16 -; GFX9-NEXT: s_add_i32 s59, s59, s58 -; GFX9-NEXT: v_writelane_b32 v21, s73, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s37, s45, s73 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s44, s44, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s44, v1 ; GFX9-NEXT: v_readfirstlane_b32 s44, v2 -; GFX9-NEXT: s_lshr_b32 s72, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s44, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s44 -; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s44, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s44 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s44, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s44, s44, s73 -; GFX9-NEXT: s_and_b32 s58, s43, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s44, s44, s57 +; GFX9-NEXT: s_and_b32 s46, s43, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s44, s44, 16 -; GFX9-NEXT: s_add_i32 s59, s59, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s36, s44, s72 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s58, s44, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s43, s43, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s43, v1 ; GFX9-NEXT: v_readfirstlane_b32 s43, v2 -; GFX9-NEXT: s_lshr_b32 s73, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s43, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s43 -; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s39, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s43, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s43 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s43, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s43, s43, s72 -; GFX9-NEXT: s_and_b32 s58, s42, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s43, s43, s56 +; GFX9-NEXT: s_and_b32 s46, s42, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_add_i32 s47, s47, s46 ; GFX9-NEXT: s_lshr_b32 s43, s43, 16 -; GFX9-NEXT: s_add_i32 s59, s59, s58 -; GFX9-NEXT: v_writelane_b32 v21, s73, 9 -; GFX9-NEXT: s_pack_ll_b32_b16 s35, s43, s73 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s42, s42, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s42, v1 ; GFX9-NEXT: v_readfirstlane_b32 s42, v2 -; GFX9-NEXT: s_lshr_b32 s72, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s42, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s42 -; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s42, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s42 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s42, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s42, s42, s73 -; GFX9-NEXT: s_and_b32 s58, s29, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s42, s42, s57 +; GFX9-NEXT: s_and_b32 s46, s29, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s42, s42, 16 -; GFX9-NEXT: s_add_i32 s59, s59, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s34, s42, s72 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s38, s42, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s29, s29, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s29, v1 ; GFX9-NEXT: v_readfirstlane_b32 s29, v2 -; GFX9-NEXT: s_lshr_b32 s73, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s29, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s29 -; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s35, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s29, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s29 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s29, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s29, s29, s72 -; GFX9-NEXT: s_and_b32 s58, s28, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s29, s29, s56 +; GFX9-NEXT: s_and_b32 s46, s28, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_add_i32 s47, s47, s46 ; GFX9-NEXT: s_lshr_b32 s29, s29, 16 -; GFX9-NEXT: s_add_i32 s59, s59, s58 -; GFX9-NEXT: v_writelane_b32 v21, s73, 10 -; GFX9-NEXT: s_pack_ll_b32_b16 s31, s29, s73 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s28, s28, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s28, v1 ; GFX9-NEXT: v_readfirstlane_b32 s28, v2 -; GFX9-NEXT: s_lshr_b32 s72, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s28, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s28 -; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s28, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s28 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s28, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s28, s28, s73 -; GFX9-NEXT: s_and_b32 s58, s27, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s28, s28, s57 +; GFX9-NEXT: s_and_b32 s46, s27, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s28, s28, 16 -; GFX9-NEXT: s_add_i32 s59, s59, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s30, s28, s72 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s34, s28, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s27, s27, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s27, v1 ; GFX9-NEXT: v_readfirstlane_b32 s27, v2 -; GFX9-NEXT: s_lshr_b32 s73, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s27, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s27 -; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s27, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s27 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s27, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s27, s27, s72 -; GFX9-NEXT: s_and_b32 s58, s26, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s27, s27, s56 +; GFX9-NEXT: s_and_b32 s46, s26, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s27, s27, 16 -; GFX9-NEXT: s_add_i32 s59, s59, s58 -; GFX9-NEXT: v_writelane_b32 v21, s73, 11 -; GFX9-NEXT: s_pack_ll_b32_b16 s95, s27, s73 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: v_writelane_b32 v21, s57, 7 +; GFX9-NEXT: s_pack_ll_b32_b16 s93, s27, s57 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s26, s26, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s26, v1 ; GFX9-NEXT: v_readfirstlane_b32 s26, v2 -; GFX9-NEXT: s_lshr_b32 s72, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s26, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s26 -; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s26, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s26 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s26, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s26, s26, s73 -; GFX9-NEXT: s_and_b32 s58, s25, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s26, s26, s57 +; GFX9-NEXT: s_and_b32 s46, s25, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s26, s26, 16 -; GFX9-NEXT: s_add_i32 s59, s59, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s94, s26, s72 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s92, s26, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s25, s25, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s25, v1 ; GFX9-NEXT: v_readfirstlane_b32 s25, v2 -; GFX9-NEXT: s_lshr_b32 s73, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s25, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s25 -; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s25, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s25 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s25, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s25, s25, s72 -; GFX9-NEXT: s_and_b32 s58, s24, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s25, s25, s56 +; GFX9-NEXT: s_and_b32 s46, s24, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s25, s25, 16 -; GFX9-NEXT: s_add_i32 s59, s59, s58 -; GFX9-NEXT: v_writelane_b32 v21, s73, 12 -; GFX9-NEXT: s_pack_ll_b32_b16 s91, s25, s73 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: v_writelane_b32 v21, s57, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s49, s25, s57 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s24, s24, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s24, v1 ; GFX9-NEXT: v_readfirstlane_b32 s24, v2 -; GFX9-NEXT: s_lshr_b32 s72, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s24, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s24 -; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s24, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s24 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s24, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s24, s24, s73 -; GFX9-NEXT: s_and_b32 s58, s23, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s24, s24, s57 +; GFX9-NEXT: s_and_b32 s46, s23, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s24, s24, 16 -; GFX9-NEXT: s_add_i32 s59, s59, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s90, s24, s72 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s48, s24, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s23, s23, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s23, v1 ; GFX9-NEXT: v_readfirstlane_b32 s23, v2 -; GFX9-NEXT: s_lshr_b32 s73, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s23, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s23 -; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s57, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s23, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s23 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s23, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s23, s23, s72 -; GFX9-NEXT: s_and_b32 s58, s22, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s23, s23, s56 +; GFX9-NEXT: s_and_b32 s46, s22, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s23, s23, 16 -; GFX9-NEXT: s_add_i32 s59, s59, s58 -; GFX9-NEXT: v_writelane_b32 v21, s73, 13 -; GFX9-NEXT: s_pack_ll_b32_b16 s77, s23, s73 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: v_writelane_b32 v21, s57, 9 +; GFX9-NEXT: s_pack_ll_b32_b16 s51, s23, s57 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s22, s22, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s22, v1 ; GFX9-NEXT: v_readfirstlane_b32 s22, v2 -; GFX9-NEXT: s_lshr_b32 s72, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s22, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s22 -; GFX9-NEXT: s_add_i32 s73, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s22, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s22 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s22, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s22, s22, s73 -; GFX9-NEXT: s_and_b32 s58, s21, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s22, s22, s57 +; GFX9-NEXT: s_and_b32 s46, s21, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s22, s22, 16 -; GFX9-NEXT: s_add_i32 s59, s59, s58 -; GFX9-NEXT: s_pack_ll_b32_b16 s76, s22, s72 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s73, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s50, s22, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s73, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s21, s21, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s21, v1 ; GFX9-NEXT: v_readfirstlane_b32 s21, v2 -; GFX9-NEXT: s_lshr_b32 s68, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s21, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s21 -; GFX9-NEXT: s_add_i32 s72, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s84, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s21, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s21 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s21, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s21, s21, s72 -; GFX9-NEXT: s_and_b32 s58, s20, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 -; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s21, s21, s56 +; GFX9-NEXT: s_and_b32 s46, s20, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 +; GFX9-NEXT: s_add_i32 s47, s47, s46 ; GFX9-NEXT: s_lshr_b32 s21, s21, 16 -; GFX9-NEXT: s_add_i32 s72, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s38, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s38, s72 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s20, s20, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s20, v1 ; GFX9-NEXT: v_readfirstlane_b32 s20, v2 -; GFX9-NEXT: s_lshr_b32 s72, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s20, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s20 -; GFX9-NEXT: s_add_i32 s38, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s56, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s20, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s20 +; GFX9-NEXT: s_add_i32 s57, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s20, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s20, s20, s38 -; GFX9-NEXT: s_and_b32 s58, s19, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 -; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s20, s20, s57 +; GFX9-NEXT: s_and_b32 s46, s19, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s47, s46, 0x10010 ; GFX9-NEXT: s_lshr_b32 s20, s20, 16 -; GFX9-NEXT: s_add_i32 s38, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s39, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s47, s47, s46 +; GFX9-NEXT: s_pack_ll_b32_b16 s52, s20, s56 +; GFX9-NEXT: s_add_i32 s56, s47, 0x7fff +; GFX9-NEXT: s_or_b32 s57, s46, 0x400000 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s39, s38 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s57, s56 ; GFX9-NEXT: s_lshl_b32 s19, s19, 16 ; GFX9-NEXT: v_add_f32_e32 v2, s19, v1 ; GFX9-NEXT: v_readfirstlane_b32 s19, v2 -; GFX9-NEXT: s_lshr_b32 s69, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s19, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s19 -; GFX9-NEXT: s_add_i32 s38, s58, 0x7fff +; GFX9-NEXT: s_lshr_b32 s85, s46, 16 +; GFX9-NEXT: s_bfe_u32 s46, s19, 0x10010 +; GFX9-NEXT: s_add_i32 s46, s46, s19 +; GFX9-NEXT: s_add_i32 s56, s46, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: s_bitset1_b32 s19, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s19, s19, s38 -; GFX9-NEXT: s_and_b32 s58, s18, 0xffff0000 -; GFX9-NEXT: v_add_f32_e32 v2, s58, v1 -; GFX9-NEXT: v_readfirstlane_b32 s58, v2 -; GFX9-NEXT: s_bfe_u32 s59, s58, 0x10010 -; GFX9-NEXT: s_add_i32 s59, s59, s58 +; GFX9-NEXT: s_and_b64 s[46:47], vcc, exec +; GFX9-NEXT: s_cselect_b32 s19, s19, s56 +; GFX9-NEXT: s_and_b32 s46, s18, 0xffff0000 +; GFX9-NEXT: v_add_f32_e32 v2, s46, v1 +; GFX9-NEXT: v_readfirstlane_b32 s46, v2 +; GFX9-NEXT: s_bfe_u32 s56, s46, 0x10010 +; GFX9-NEXT: s_add_i32 s56, s56, s46 ; GFX9-NEXT: s_lshr_b32 s19, s19, 16 -; GFX9-NEXT: s_add_i32 s38, s59, 0x7fff -; GFX9-NEXT: s_or_b32 s39, s58, 0x400000 +; GFX9-NEXT: s_add_i32 s59, s56, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_cselect_b32 s58, s39, s38 +; GFX9-NEXT: s_bitset1_b32 s46, 22 +; GFX9-NEXT: s_and_b64 s[56:57], vcc, exec +; GFX9-NEXT: s_cselect_b32 s46, s46, s59 ; GFX9-NEXT: s_lshl_b32 s18, s18, 16 ; GFX9-NEXT: v_add_f32_e32 v1, s18, v1 ; GFX9-NEXT: v_readfirstlane_b32 s18, v1 -; GFX9-NEXT: s_lshr_b32 s38, s58, 16 -; GFX9-NEXT: s_bfe_u32 s58, s18, 0x10010 -; GFX9-NEXT: s_add_i32 s58, s58, s18 -; GFX9-NEXT: s_add_i32 s39, s58, 0x7fff +; GFX9-NEXT: s_bfe_u32 s56, s18, 0x10010 +; GFX9-NEXT: s_add_i32 s56, s56, s18 +; GFX9-NEXT: s_lshr_b32 s46, s46, 16 +; GFX9-NEXT: s_add_i32 s59, s56, 0x7fff ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_bitset1_b32 s18, 22 -; GFX9-NEXT: s_and_b64 s[58:59], vcc, exec -; GFX9-NEXT: s_pack_ll_b32_b16 s59, s19, s69 -; GFX9-NEXT: s_cselect_b32 s18, s18, s39 +; GFX9-NEXT: s_and_b64 s[56:57], vcc, exec +; GFX9-NEXT: s_cselect_b32 s18, s18, s59 ; GFX9-NEXT: s_lshr_b32 s18, s18, 16 -; GFX9-NEXT: s_lshr_b32 vcc_lo, s59, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s58, s18, s38 -; GFX9-NEXT: v_writelane_b32 v21, vcc_lo, 14 -; GFX9-NEXT: s_lshr_b32 vcc_lo, s59, 8 -; GFX9-NEXT: v_writelane_b32 v21, vcc_lo, 15 -; GFX9-NEXT: s_lshr_b32 vcc_lo, s58, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s73, s21, s68 -; GFX9-NEXT: v_writelane_b32 v21, vcc_lo, 16 -; GFX9-NEXT: s_lshr_b32 vcc_lo, s58, 8 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[58:59], 24 -; GFX9-NEXT: v_writelane_b32 v21, vcc_lo, 17 -; GFX9-NEXT: s_lshr_b32 s59, s73, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s72, s20, s72 -; GFX9-NEXT: v_writelane_b32 v21, s59, 18 -; GFX9-NEXT: s_lshr_b32 s59, s73, 8 -; GFX9-NEXT: v_writelane_b32 v21, s59, 19 -; GFX9-NEXT: s_lshr_b32 s59, s72, 16 -; GFX9-NEXT: v_writelane_b32 v21, s59, 20 -; GFX9-NEXT: s_lshr_b32 s59, s72, 8 -; GFX9-NEXT: v_writelane_b32 v21, s59, 21 -; GFX9-NEXT: s_lshr_b32 s59, s77, 24 -; GFX9-NEXT: v_writelane_b32 v21, s59, 22 -; GFX9-NEXT: s_lshr_b32 s59, s77, 8 -; GFX9-NEXT: v_writelane_b32 v21, s59, 23 -; GFX9-NEXT: s_lshr_b32 s59, s76, 16 -; GFX9-NEXT: v_writelane_b32 v21, s59, 24 -; GFX9-NEXT: s_lshr_b32 s59, s76, 8 -; GFX9-NEXT: v_writelane_b32 v21, s59, 25 -; GFX9-NEXT: s_lshr_b32 s59, s91, 24 -; GFX9-NEXT: v_writelane_b32 v21, s59, 26 -; GFX9-NEXT: s_lshr_b32 s59, s91, 8 -; GFX9-NEXT: v_writelane_b32 v21, s59, 27 -; GFX9-NEXT: s_lshr_b32 s59, s90, 16 -; GFX9-NEXT: v_writelane_b32 v21, s59, 28 -; GFX9-NEXT: s_lshr_b32 s59, s90, 8 -; GFX9-NEXT: v_writelane_b32 v21, s59, 29 -; GFX9-NEXT: s_lshr_b32 s59, s95, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s46, s4, s46 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[72:73], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[76:77], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[90:91], 24 -; GFX9-NEXT: v_writelane_b32 v21, s59, 30 -; GFX9-NEXT: s_lshr_b32 s59, s95, 8 -; GFX9-NEXT: v_writelane_b32 v21, s59, 31 -; GFX9-NEXT: s_lshr_b32 s59, s47, 24 -; GFX9-NEXT: s_lshr_b32 s73, s47, 8 -; GFX9-NEXT: s_lshr_b32 s77, s46, 16 -; GFX9-NEXT: s_lshr_b32 s91, s46, 8 -; GFX9-NEXT: s_lshr_b64 s[46:47], s[46:47], 24 -; GFX9-NEXT: s_lshr_b32 s47, s61, 24 -; GFX9-NEXT: v_writelane_b32 v21, s47, 32 -; GFX9-NEXT: s_lshr_b32 s47, s61, 8 -; GFX9-NEXT: v_writelane_b32 v21, s47, 33 -; GFX9-NEXT: s_lshr_b32 s47, s60, 16 -; GFX9-NEXT: v_writelane_b32 v21, s47, 34 -; GFX9-NEXT: s_lshr_b32 s47, s60, 8 -; GFX9-NEXT: v_writelane_b32 v21, s47, 35 -; GFX9-NEXT: s_lshr_b32 s47, s63, 8 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[60:61], 24 -; GFX9-NEXT: v_writelane_b32 v21, s47, 36 -; GFX9-NEXT: s_lshr_b32 s47, s62, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s57, s7, s57 -; GFX9-NEXT: s_pack_ll_b32_b16 s56, s6, s56 -; GFX9-NEXT: s_lshr_b32 s38, s94, 16 -; GFX9-NEXT: s_lshr_b32 s48, s94, 8 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[94:95], 24 -; GFX9-NEXT: s_lshr_b32 s50, s31, 24 -; GFX9-NEXT: s_lshr_b32 s51, s31, 8 -; GFX9-NEXT: s_lshr_b32 s53, s30, 16 -; GFX9-NEXT: s_lshr_b32 s71, s30, 8 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[30:31], 24 -; GFX9-NEXT: s_lshr_b32 s80, s35, 24 -; GFX9-NEXT: s_lshr_b32 s82, s35, 8 -; GFX9-NEXT: s_lshr_b32 s65, s34, 16 -; GFX9-NEXT: s_lshr_b32 s85, s34, 8 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], 24 -; GFX9-NEXT: s_lshr_b32 s66, s37, 24 -; GFX9-NEXT: s_lshr_b32 s97, s37, 8 -; GFX9-NEXT: s_lshr_b32 s99, s36, 16 -; GFX9-NEXT: s_lshr_b32 s67, s36, 8 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[36:37], 24 -; GFX9-NEXT: s_lshr_b32 s61, s63, 24 -; GFX9-NEXT: v_writelane_b32 v21, s47, 37 -; GFX9-NEXT: s_lshr_b32 s47, s62, 8 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[62:63], 24 -; GFX9-NEXT: s_lshr_b32 s95, s57, 24 -; GFX9-NEXT: s_lshr_b32 s31, s57, 8 -; GFX9-NEXT: s_lshr_b32 s35, s56, 16 -; GFX9-NEXT: s_lshr_b32 s37, s56, 8 -; GFX9-NEXT: s_lshr_b64 s[56:57], s[56:57], 24 -; GFX9-NEXT: v_writelane_b32 v21, s47, 38 -; GFX9-NEXT: s_lshr_b32 s63, s75, 24 -; GFX9-NEXT: s_lshr_b32 s47, s75, 8 -; GFX9-NEXT: s_lshr_b32 s54, s74, 16 -; GFX9-NEXT: s_lshr_b32 s39, s74, 8 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[74:75], 24 -; GFX9-NEXT: s_lshr_b32 s49, s79, 24 -; GFX9-NEXT: s_lshr_b32 s55, s79, 8 -; GFX9-NEXT: s_lshr_b32 s52, s78, 16 -; GFX9-NEXT: s_lshr_b32 s70, s78, 8 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[78:79], 24 -; GFX9-NEXT: s_lshr_b32 s64, s93, 24 -; GFX9-NEXT: s_lshr_b32 s81, s93, 8 -; GFX9-NEXT: s_lshr_b32 s83, s92, 16 -; GFX9-NEXT: s_lshr_b32 s84, s92, 8 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[92:93], 24 -; GFX9-NEXT: s_lshr_b32 s86, s89, 24 -; GFX9-NEXT: s_lshr_b32 s87, s89, 8 -; GFX9-NEXT: s_lshr_b32 s96, s88, 16 -; GFX9-NEXT: s_lshr_b32 s98, s88, 8 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[88:89], 24 -; GFX9-NEXT: v_writelane_b32 v21, s47, 39 +; GFX9-NEXT: s_pack_ll_b32_b16 s47, s19, s85 +; GFX9-NEXT: s_pack_ll_b32_b16 s46, s18, s46 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[46:47], 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s53, s21, s84 +; GFX9-NEXT: v_writelane_b32 v21, s56, 2 +; GFX9-NEXT: v_writelane_b32 v21, s57, 3 +; GFX9-NEXT: s_lshr_b64 s[56:57], s[52:53], 24 +; GFX9-NEXT: v_writelane_b32 v21, s56, 0 +; GFX9-NEXT: v_writelane_b32 v21, s57, 1 +; GFX9-NEXT: s_lshr_b32 s56, s47, 24 +; GFX9-NEXT: v_writelane_b32 v21, s56, 10 +; GFX9-NEXT: s_lshr_b32 s47, s47, 8 +; GFX9-NEXT: v_writelane_b32 v21, s47, 11 +; GFX9-NEXT: s_lshr_b32 s47, s46, 16 +; GFX9-NEXT: v_writelane_b32 v21, s47, 12 +; GFX9-NEXT: s_lshr_b32 s46, s46, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 13 +; GFX9-NEXT: s_lshr_b32 s46, s53, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 14 +; GFX9-NEXT: s_lshr_b32 s46, s53, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 15 +; GFX9-NEXT: s_lshr_b32 s46, s52, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 16 +; GFX9-NEXT: s_lshr_b32 s46, s52, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 17 +; GFX9-NEXT: s_lshr_b32 s46, s51, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 18 +; GFX9-NEXT: s_lshr_b32 s46, s51, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 19 +; GFX9-NEXT: s_lshr_b32 s46, s50, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 20 +; GFX9-NEXT: s_lshr_b32 s46, s50, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 21 +; GFX9-NEXT: s_lshr_b32 s46, s49, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 22 +; GFX9-NEXT: s_lshr_b32 s46, s49, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 23 +; GFX9-NEXT: s_lshr_b32 s46, s48, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 24 +; GFX9-NEXT: s_lshr_b32 s46, s48, 8 +; GFX9-NEXT: s_lshr_b64 s[62:63], s[50:51], 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 25 +; GFX9-NEXT: s_lshr_b32 s46, s93, 24 +; GFX9-NEXT: s_mov_b32 s63, s61 +; GFX9-NEXT: v_writelane_b32 v21, s46, 26 +; GFX9-NEXT: s_lshr_b32 s46, s93, 8 +; GFX9-NEXT: s_mov_b32 s70, s35 +; GFX9-NEXT: s_pack_ll_b32_b16 s35, s29, s35 +; GFX9-NEXT: s_mov_b32 s54, s39 +; GFX9-NEXT: s_pack_ll_b32_b16 s39, s43, s39 +; GFX9-NEXT: s_pack_ll_b32_b16 s59, s45, s37 +; GFX9-NEXT: s_pack_ll_b32_b16 s61, s5, s31 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[48:49], 24 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[92:93], 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 27 +; GFX9-NEXT: s_lshr_b32 s55, s92, 16 +; GFX9-NEXT: s_lshr_b32 s64, s92, 8 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[34:35], 24 +; GFX9-NEXT: s_lshr_b32 s71, s35, 24 +; GFX9-NEXT: s_lshr_b32 s68, s35, 8 +; GFX9-NEXT: s_lshr_b32 s86, s34, 16 +; GFX9-NEXT: s_lshr_b32 s87, s34, 8 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[38:39], 24 +; GFX9-NEXT: s_lshr_b32 s97, s39, 24 +; GFX9-NEXT: s_lshr_b32 s48, s39, 8 +; GFX9-NEXT: s_lshr_b32 s53, s38, 16 +; GFX9-NEXT: s_lshr_b32 s47, s38, 8 +; GFX9-NEXT: s_lshr_b64 s[38:39], s[58:59], 24 +; GFX9-NEXT: s_lshr_b32 s99, s59, 24 +; GFX9-NEXT: s_lshr_b32 s50, s59, 8 +; GFX9-NEXT: s_lshr_b32 s83, s58, 16 +; GFX9-NEXT: s_lshr_b32 s57, s58, 8 +; GFX9-NEXT: s_lshr_b64 s[58:59], s[60:61], 24 +; GFX9-NEXT: v_writelane_b32 v21, s58, 4 +; GFX9-NEXT: v_writelane_b32 v21, s59, 5 +; GFX9-NEXT: s_lshr_b32 s46, s61, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 28 +; GFX9-NEXT: s_lshr_b32 s46, s61, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 29 +; GFX9-NEXT: s_lshr_b32 s46, s60, 16 +; GFX9-NEXT: s_mov_b32 s75, s73 +; GFX9-NEXT: v_writelane_b32 v21, s46, 30 +; GFX9-NEXT: s_lshr_b32 s46, s60, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s73, s7, s95 +; GFX9-NEXT: v_writelane_b32 v21, s46, 31 +; GFX9-NEXT: s_lshr_b32 s46, s73, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 32 +; GFX9-NEXT: s_lshr_b32 s46, s73, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 33 +; GFX9-NEXT: s_lshr_b32 s46, s72, 16 +; GFX9-NEXT: s_mov_b32 s77, s79 +; GFX9-NEXT: v_writelane_b32 v21, s46, 34 +; GFX9-NEXT: s_lshr_b32 s46, s72, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s79, s9, s91 +; GFX9-NEXT: v_writelane_b32 v21, s46, 35 +; GFX9-NEXT: s_lshr_b32 s46, s79, 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 36 +; GFX9-NEXT: s_lshr_b32 s46, s79, 8 +; GFX9-NEXT: s_lshr_b64 s[60:61], s[72:73], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[78:79], 24 +; GFX9-NEXT: v_writelane_b32 v21, s46, 37 +; GFX9-NEXT: s_lshr_b32 s46, s78, 16 +; GFX9-NEXT: s_mov_b32 s73, s89 +; GFX9-NEXT: v_writelane_b32 v21, s46, 38 +; GFX9-NEXT: s_lshr_b32 s46, s78, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s89, s11, s77 +; GFX9-NEXT: v_writelane_b32 v21, s46, 39 +; GFX9-NEXT: s_lshr_b32 s46, s89, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 40 +; GFX9-NEXT: s_lshr_b32 s46, s88, 16 +; GFX9-NEXT: v_writelane_b32 v21, s46, 41 +; GFX9-NEXT: s_lshr_b32 s46, s88, 8 +; GFX9-NEXT: v_writelane_b32 v21, s46, 42 +; GFX9-NEXT: v_readlane_b32 s61, v21, 6 +; GFX9-NEXT: s_mov_b32 s93, s91 +; GFX9-NEXT: s_mov_b32 s35, s95 +; GFX9-NEXT: s_mov_b32 s51, s37 +; GFX9-NEXT: s_mov_b32 s39, s31 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[88:89], 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s91, s13, s75 +; GFX9-NEXT: s_pack_ll_b32_b16 s95, s15, s73 +; GFX9-NEXT: s_pack_ll_b32_b16 s31, s17, s63 +; GFX9-NEXT: s_pack_ll_b32_b16 s37, s41, s61 +; GFX9-NEXT: s_lshr_b32 s79, s89, 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[90:91], 24 +; GFX9-NEXT: s_lshr_b32 s65, s91, 24 +; GFX9-NEXT: s_lshr_b32 s66, s91, 8 +; GFX9-NEXT: s_lshr_b32 s67, s90, 16 +; GFX9-NEXT: s_lshr_b32 s69, s90, 8 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[94:95], 24 +; GFX9-NEXT: s_lshr_b32 s80, s95, 24 +; GFX9-NEXT: s_lshr_b32 s96, s95, 8 +; GFX9-NEXT: s_lshr_b32 s98, s94, 16 +; GFX9-NEXT: s_lshr_b32 s81, s94, 8 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[30:31], 24 +; GFX9-NEXT: s_lshr_b32 s46, s31, 24 +; GFX9-NEXT: s_lshr_b32 s49, s31, 8 +; GFX9-NEXT: s_lshr_b32 s52, s30, 16 +; GFX9-NEXT: s_lshr_b32 s82, s30, 8 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[36:37], 24 +; GFX9-NEXT: s_lshr_b32 s56, s37, 24 +; GFX9-NEXT: s_lshr_b32 s37, s37, 8 +; GFX9-NEXT: s_lshr_b32 s58, s36, 16 +; GFX9-NEXT: s_lshr_b32 s59, s36, 8 +; GFX9-NEXT: s_mov_b32 s36, s61 ; GFX9-NEXT: .LBB91_3: ; %end -; GFX9-NEXT: s_lshl_b32 s47, s67, 8 ; GFX9-NEXT: s_and_b32 s44, s44, 0xff -; GFX9-NEXT: s_or_b32 s44, s44, s47 -; GFX9-NEXT: s_lshl_b32 s47, s36, 8 -; GFX9-NEXT: s_and_b32 s57, s99, 0xff -; GFX9-NEXT: s_or_b32 s47, s57, s47 +; GFX9-NEXT: s_lshl_b32 s57, s57, 8 +; GFX9-NEXT: s_or_b32 s44, s44, s57 +; GFX9-NEXT: s_and_b32 s57, s83, 0xff +; GFX9-NEXT: s_lshl_b32 s61, s38, 8 +; GFX9-NEXT: s_or_b32 s57, s57, s61 ; GFX9-NEXT: s_and_b32 s44, s44, 0xffff -; GFX9-NEXT: s_lshl_b32 s47, s47, 16 -; GFX9-NEXT: s_or_b32 s44, s44, s47 +; GFX9-NEXT: s_lshl_b32 s57, s57, 16 +; GFX9-NEXT: s_or_b32 s44, s44, s57 ; GFX9-NEXT: v_mov_b32_e32 v1, s44 ; GFX9-NEXT: s_and_b32 s44, s45, 0xff -; GFX9-NEXT: s_lshl_b32 s45, s97, 8 +; GFX9-NEXT: s_lshl_b32 s45, s50, 8 ; GFX9-NEXT: s_or_b32 s44, s44, s45 -; GFX9-NEXT: v_readlane_b32 s45, v21, 8 -; GFX9-NEXT: s_and_b32 s45, s45, 0xff -; GFX9-NEXT: s_lshl_b32 s47, s66, 8 -; GFX9-NEXT: s_or_b32 s45, s45, s47 +; GFX9-NEXT: s_and_b32 s45, s51, 0xff +; GFX9-NEXT: s_lshl_b32 s57, s99, 8 +; GFX9-NEXT: s_or_b32 s45, s45, s57 ; GFX9-NEXT: s_and_b32 s44, s44, 0xffff ; GFX9-NEXT: s_lshl_b32 s45, s45, 16 ; GFX9-NEXT: s_or_b32 s44, s44, s45 ; GFX9-NEXT: v_mov_b32_e32 v2, s44 -; GFX9-NEXT: s_lshl_b32 s44, s85, 8 ; GFX9-NEXT: s_and_b32 s42, s42, 0xff +; GFX9-NEXT: s_lshl_b32 s44, s47, 8 ; GFX9-NEXT: s_or_b32 s42, s42, s44 -; GFX9-NEXT: s_lshl_b32 s44, s34, 8 -; GFX9-NEXT: s_and_b32 s45, s65, 0xff -; GFX9-NEXT: s_or_b32 s44, s45, s44 +; GFX9-NEXT: s_and_b32 s44, s53, 0xff +; GFX9-NEXT: s_lshl_b32 s45, s34, 8 +; GFX9-NEXT: s_or_b32 s44, s44, s45 ; GFX9-NEXT: s_and_b32 s42, s42, 0xffff ; GFX9-NEXT: s_lshl_b32 s44, s44, 16 ; GFX9-NEXT: s_or_b32 s42, s42, s44 ; GFX9-NEXT: v_mov_b32_e32 v3, s42 ; GFX9-NEXT: s_and_b32 s42, s43, 0xff -; GFX9-NEXT: s_lshl_b32 s43, s82, 8 +; GFX9-NEXT: s_lshl_b32 s43, s48, 8 ; GFX9-NEXT: s_or_b32 s42, s42, s43 -; GFX9-NEXT: v_readlane_b32 s43, v21, 9 -; GFX9-NEXT: s_and_b32 s43, s43, 0xff -; GFX9-NEXT: s_lshl_b32 s44, s80, 8 +; GFX9-NEXT: s_and_b32 s43, s54, 0xff +; GFX9-NEXT: s_lshl_b32 s44, s97, 8 ; GFX9-NEXT: s_or_b32 s43, s43, s44 ; GFX9-NEXT: s_and_b32 s42, s42, 0xffff ; GFX9-NEXT: s_lshl_b32 s43, s43, 16 ; GFX9-NEXT: s_or_b32 s42, s42, s43 ; GFX9-NEXT: v_mov_b32_e32 v4, s42 -; GFX9-NEXT: s_lshl_b32 s42, s71, 8 ; GFX9-NEXT: s_and_b32 s28, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s42, s87, 8 ; GFX9-NEXT: s_or_b32 s28, s28, s42 -; GFX9-NEXT: s_lshl_b32 s42, s30, 8 -; GFX9-NEXT: s_and_b32 s43, s53, 0xff -; GFX9-NEXT: s_or_b32 s42, s43, s42 +; GFX9-NEXT: s_and_b32 s42, s86, 0xff +; GFX9-NEXT: s_lshl_b32 s43, s92, 8 +; GFX9-NEXT: s_or_b32 s42, s42, s43 ; GFX9-NEXT: s_and_b32 s28, s28, 0xffff ; GFX9-NEXT: s_lshl_b32 s42, s42, 16 ; GFX9-NEXT: s_or_b32 s28, s28, s42 ; GFX9-NEXT: v_mov_b32_e32 v5, s28 ; GFX9-NEXT: s_and_b32 s28, s29, 0xff -; GFX9-NEXT: s_lshl_b32 s29, s51, 8 +; GFX9-NEXT: s_lshl_b32 s29, s68, 8 ; GFX9-NEXT: s_or_b32 s28, s28, s29 -; GFX9-NEXT: v_readlane_b32 s29, v21, 10 -; GFX9-NEXT: s_and_b32 s29, s29, 0xff -; GFX9-NEXT: s_lshl_b32 s42, s50, 8 +; GFX9-NEXT: s_and_b32 s29, s70, 0xff +; GFX9-NEXT: s_lshl_b32 s42, s71, 8 ; GFX9-NEXT: s_or_b32 s29, s29, s42 ; GFX9-NEXT: s_and_b32 s28, s28, 0xffff ; GFX9-NEXT: s_lshl_b32 s29, s29, 16 ; GFX9-NEXT: s_or_b32 s28, s28, s29 ; GFX9-NEXT: v_mov_b32_e32 v6, s28 -; GFX9-NEXT: s_lshl_b32 s28, s48, 8 ; GFX9-NEXT: s_and_b32 s26, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s28, s64, 8 ; GFX9-NEXT: s_or_b32 s26, s26, s28 -; GFX9-NEXT: s_lshl_b32 s28, s94, 8 -; GFX9-NEXT: s_and_b32 s29, s38, 0xff -; GFX9-NEXT: s_or_b32 s28, s29, s28 +; GFX9-NEXT: s_and_b32 s28, s55, 0xff +; GFX9-NEXT: s_lshl_b32 s29, s76, 8 +; GFX9-NEXT: s_or_b32 s28, s28, s29 ; GFX9-NEXT: s_and_b32 s26, s26, 0xffff ; GFX9-NEXT: s_lshl_b32 s28, s28, 16 ; GFX9-NEXT: s_or_b32 s26, s26, s28 ; GFX9-NEXT: v_mov_b32_e32 v7, s26 ; GFX9-NEXT: s_and_b32 s26, s27, 0xff -; GFX9-NEXT: v_readlane_b32 s27, v21, 31 +; GFX9-NEXT: v_readlane_b32 s27, v21, 27 ; GFX9-NEXT: s_lshl_b32 s27, s27, 8 ; GFX9-NEXT: s_or_b32 s26, s26, s27 -; GFX9-NEXT: v_readlane_b32 s27, v21, 11 -; GFX9-NEXT: v_readlane_b32 s28, v21, 30 +; GFX9-NEXT: v_readlane_b32 s27, v21, 7 +; GFX9-NEXT: v_readlane_b32 s28, v21, 26 ; GFX9-NEXT: s_and_b32 s27, s27, 0xff ; GFX9-NEXT: s_lshl_b32 s28, s28, 8 ; GFX9-NEXT: s_or_b32 s27, s27, s28 @@ -172402,24 +171753,24 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_lshl_b32 s27, s27, 16 ; GFX9-NEXT: s_or_b32 s26, s26, s27 ; GFX9-NEXT: v_mov_b32_e32 v8, s26 -; GFX9-NEXT: v_readlane_b32 s26, v21, 29 -; GFX9-NEXT: s_lshl_b32 s26, s26, 8 +; GFX9-NEXT: v_readlane_b32 s26, v21, 25 ; GFX9-NEXT: s_and_b32 s24, s24, 0xff -; GFX9-NEXT: v_readlane_b32 s27, v21, 28 +; GFX9-NEXT: s_lshl_b32 s26, s26, 8 ; GFX9-NEXT: s_or_b32 s24, s24, s26 -; GFX9-NEXT: s_lshl_b32 s26, s90, 8 -; GFX9-NEXT: s_and_b32 s27, s27, 0xff -; GFX9-NEXT: s_or_b32 s26, s27, s26 +; GFX9-NEXT: v_readlane_b32 s26, v21, 24 +; GFX9-NEXT: s_and_b32 s26, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s27, s74, 8 +; GFX9-NEXT: s_or_b32 s26, s26, s27 ; GFX9-NEXT: s_and_b32 s24, s24, 0xffff ; GFX9-NEXT: s_lshl_b32 s26, s26, 16 ; GFX9-NEXT: s_or_b32 s24, s24, s26 ; GFX9-NEXT: v_mov_b32_e32 v9, s24 ; GFX9-NEXT: s_and_b32 s24, s25, 0xff -; GFX9-NEXT: v_readlane_b32 s25, v21, 27 +; GFX9-NEXT: v_readlane_b32 s25, v21, 23 ; GFX9-NEXT: s_lshl_b32 s25, s25, 8 ; GFX9-NEXT: s_or_b32 s24, s24, s25 -; GFX9-NEXT: v_readlane_b32 s25, v21, 12 -; GFX9-NEXT: v_readlane_b32 s26, v21, 26 +; GFX9-NEXT: v_readlane_b32 s25, v21, 8 +; GFX9-NEXT: v_readlane_b32 s26, v21, 22 ; GFX9-NEXT: s_and_b32 s25, s25, 0xff ; GFX9-NEXT: s_lshl_b32 s26, s26, 8 ; GFX9-NEXT: s_or_b32 s25, s25, s26 @@ -172427,75 +171778,77 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_lshl_b32 s25, s25, 16 ; GFX9-NEXT: s_or_b32 s24, s24, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-NEXT: v_readlane_b32 s24, v21, 25 -; GFX9-NEXT: s_lshl_b32 s24, s24, 8 +; GFX9-NEXT: v_readlane_b32 s24, v21, 21 ; GFX9-NEXT: s_and_b32 s22, s22, 0xff -; GFX9-NEXT: v_readlane_b32 s25, v21, 24 +; GFX9-NEXT: s_lshl_b32 s24, s24, 8 ; GFX9-NEXT: s_or_b32 s22, s22, s24 -; GFX9-NEXT: s_lshl_b32 s24, s76, 8 -; GFX9-NEXT: s_and_b32 s25, s25, 0xff -; GFX9-NEXT: s_or_b32 s24, s25, s24 +; GFX9-NEXT: v_readlane_b32 s24, v21, 20 +; GFX9-NEXT: s_and_b32 s24, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s25, s62, 8 +; GFX9-NEXT: s_or_b32 s24, s24, s25 ; GFX9-NEXT: s_and_b32 s22, s22, 0xffff ; GFX9-NEXT: s_lshl_b32 s24, s24, 16 ; GFX9-NEXT: s_or_b32 s22, s22, s24 ; GFX9-NEXT: v_mov_b32_e32 v11, s22 ; GFX9-NEXT: s_and_b32 s22, s23, 0xff -; GFX9-NEXT: v_readlane_b32 s23, v21, 23 +; GFX9-NEXT: v_readlane_b32 s23, v21, 19 ; GFX9-NEXT: s_lshl_b32 s23, s23, 8 ; GFX9-NEXT: s_or_b32 s22, s22, s23 -; GFX9-NEXT: v_readlane_b32 s23, v21, 13 -; GFX9-NEXT: v_readlane_b32 s24, v21, 22 +; GFX9-NEXT: v_readlane_b32 s23, v21, 9 +; GFX9-NEXT: v_readlane_b32 s24, v21, 18 ; GFX9-NEXT: s_and_b32 s23, s23, 0xff ; GFX9-NEXT: s_lshl_b32 s24, s24, 8 ; GFX9-NEXT: s_or_b32 s23, s23, s24 ; GFX9-NEXT: s_and_b32 s22, s22, 0xffff ; GFX9-NEXT: s_lshl_b32 s23, s23, 16 ; GFX9-NEXT: s_or_b32 s22, s22, s23 -; GFX9-NEXT: v_mov_b32_e32 v12, s22 -; GFX9-NEXT: v_readlane_b32 s22, v21, 21 -; GFX9-NEXT: s_lshl_b32 s22, s22, 8 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-NEXT: v_readlane_b32 s22, v21, 17 ; GFX9-NEXT: s_and_b32 s20, s20, 0xff -; GFX9-NEXT: v_readlane_b32 s23, v21, 20 +; GFX9-NEXT: s_lshl_b32 s22, s22, 8 ; GFX9-NEXT: s_or_b32 s20, s20, s22 -; GFX9-NEXT: s_lshl_b32 s22, s72, 8 -; GFX9-NEXT: s_and_b32 s23, s23, 0xff -; GFX9-NEXT: s_or_b32 s22, s23, s22 +; GFX9-NEXT: v_readlane_b32 s22, v21, 16 +; GFX9-NEXT: v_readlane_b32 s24, v21, 0 +; GFX9-NEXT: s_and_b32 s22, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s23, s24, 8 +; GFX9-NEXT: s_or_b32 s22, s22, s23 ; GFX9-NEXT: s_and_b32 s20, s20, 0xffff ; GFX9-NEXT: s_lshl_b32 s22, s22, 16 ; GFX9-NEXT: s_or_b32 s20, s20, s22 -; GFX9-NEXT: v_mov_b32_e32 v13, s20 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s20 ; GFX9-NEXT: s_and_b32 s20, s21, 0xff -; GFX9-NEXT: v_readlane_b32 s21, v21, 19 +; GFX9-NEXT: v_readlane_b32 s21, v21, 15 ; GFX9-NEXT: s_lshl_b32 s21, s21, 8 -; GFX9-NEXT: v_readlane_b32 s22, v21, 18 +; GFX9-NEXT: v_readlane_b32 s22, v21, 14 ; GFX9-NEXT: s_or_b32 s20, s20, s21 -; GFX9-NEXT: s_and_b32 s21, s68, 0xff +; GFX9-NEXT: s_and_b32 s21, s84, 0xff ; GFX9-NEXT: s_lshl_b32 s22, s22, 8 ; GFX9-NEXT: s_or_b32 s21, s21, s22 ; GFX9-NEXT: s_and_b32 s20, s20, 0xffff ; GFX9-NEXT: s_lshl_b32 s21, s21, 16 ; GFX9-NEXT: s_or_b32 s20, s20, s21 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 ; GFX9-NEXT: v_mov_b32_e32 v1, s20 -; GFX9-NEXT: v_readlane_b32 s20, v21, 17 +; GFX9-NEXT: v_readlane_b32 s20, v21, 13 ; GFX9-NEXT: s_and_b32 s18, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s20, s20, 8 ; GFX9-NEXT: s_or_b32 s18, s18, s20 -; GFX9-NEXT: v_readlane_b32 s20, v21, 16 +; GFX9-NEXT: v_readlane_b32 s20, v21, 12 +; GFX9-NEXT: v_readlane_b32 s22, v21, 2 ; GFX9-NEXT: s_and_b32 s20, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s21, s58, 8 +; GFX9-NEXT: s_lshl_b32 s21, s22, 8 ; GFX9-NEXT: s_or_b32 s20, s20, s21 ; GFX9-NEXT: s_and_b32 s18, s18, 0xffff ; GFX9-NEXT: s_lshl_b32 s20, s20, 16 @@ -172503,11 +171856,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: s_and_b32 s18, s19, 0xff -; GFX9-NEXT: v_readlane_b32 s19, v21, 15 +; GFX9-NEXT: v_readlane_b32 s19, v21, 11 ; GFX9-NEXT: s_lshl_b32 s19, s19, 8 -; GFX9-NEXT: v_readlane_b32 s20, v21, 14 +; GFX9-NEXT: v_readlane_b32 s20, v21, 10 ; GFX9-NEXT: s_or_b32 s18, s18, s19 -; GFX9-NEXT: s_and_b32 s19, s69, 0xff +; GFX9-NEXT: s_and_b32 s19, s85, 0xff ; GFX9-NEXT: s_lshl_b32 s20, s20, 8 ; GFX9-NEXT: s_or_b32 s19, s19, s20 ; GFX9-NEXT: s_and_b32 s18, s18, 0xffff @@ -172516,10 +171869,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: s_and_b32 s18, s40, 0xff -; GFX9-NEXT: s_lshl_b32 s19, s98, 8 +; GFX9-NEXT: s_lshl_b32 s19, s59, 8 ; GFX9-NEXT: s_or_b32 s18, s18, s19 -; GFX9-NEXT: s_and_b32 s19, s96, 0xff -; GFX9-NEXT: s_lshl_b32 s20, s88, 8 +; GFX9-NEXT: s_and_b32 s19, s58, 0xff +; GFX9-NEXT: s_lshl_b32 s20, s30, 8 ; GFX9-NEXT: s_or_b32 s19, s19, s20 ; GFX9-NEXT: s_and_b32 s18, s18, 0xffff ; GFX9-NEXT: s_lshl_b32 s19, s19, 16 @@ -172527,11 +171880,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: s_and_b32 s18, s41, 0xff -; GFX9-NEXT: s_lshl_b32 s19, s87, 8 +; GFX9-NEXT: s_lshl_b32 s19, s37, 8 ; GFX9-NEXT: s_or_b32 s18, s18, s19 -; GFX9-NEXT: v_readlane_b32 s19, v21, 0 -; GFX9-NEXT: s_and_b32 s19, s19, 0xff -; GFX9-NEXT: s_lshl_b32 s20, s86, 8 +; GFX9-NEXT: s_and_b32 s19, s36, 0xff +; GFX9-NEXT: s_lshl_b32 s20, s56, 8 ; GFX9-NEXT: s_or_b32 s19, s19, s20 ; GFX9-NEXT: s_and_b32 s18, s18, 0xffff ; GFX9-NEXT: s_lshl_b32 s19, s19, 16 @@ -172539,10 +171891,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s84, 8 +; GFX9-NEXT: s_lshl_b32 s18, s82, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s18 -; GFX9-NEXT: s_and_b32 s18, s83, 0xff -; GFX9-NEXT: s_lshl_b32 s19, s92, 8 +; GFX9-NEXT: s_and_b32 s18, s52, 0xff +; GFX9-NEXT: s_lshl_b32 s19, s94, 8 ; GFX9-NEXT: s_or_b32 s18, s18, s19 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s18, s18, 16 @@ -172550,11 +171902,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s16, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s81, 8 +; GFX9-NEXT: s_lshl_b32 s17, s49, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 -; GFX9-NEXT: v_readlane_b32 s17, v21, 1 -; GFX9-NEXT: s_and_b32 s17, s17, 0xff -; GFX9-NEXT: s_lshl_b32 s18, s64, 8 +; GFX9-NEXT: s_and_b32 s17, s63, 0xff +; GFX9-NEXT: s_lshl_b32 s18, s46, 8 ; GFX9-NEXT: s_or_b32 s17, s17, s18 ; GFX9-NEXT: s_and_b32 s16, s16, 0xffff ; GFX9-NEXT: s_lshl_b32 s17, s17, 16 @@ -172562,10 +171913,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: s_and_b32 s14, s14, 0xff -; GFX9-NEXT: s_lshl_b32 s16, s70, 8 +; GFX9-NEXT: s_lshl_b32 s16, s81, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s16 -; GFX9-NEXT: s_and_b32 s16, s52, 0xff -; GFX9-NEXT: s_lshl_b32 s17, s78, 8 +; GFX9-NEXT: s_and_b32 s16, s98, 0xff +; GFX9-NEXT: s_lshl_b32 s17, s90, 8 ; GFX9-NEXT: s_or_b32 s16, s16, s17 ; GFX9-NEXT: s_and_b32 s14, s14, 0xffff ; GFX9-NEXT: s_lshl_b32 s16, s16, 16 @@ -172573,11 +171924,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-NEXT: s_and_b32 s14, s15, 0xff -; GFX9-NEXT: s_lshl_b32 s15, s55, 8 +; GFX9-NEXT: s_lshl_b32 s15, s96, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 -; GFX9-NEXT: v_readlane_b32 s15, v21, 2 -; GFX9-NEXT: s_and_b32 s15, s15, 0xff -; GFX9-NEXT: s_lshl_b32 s16, s49, 8 +; GFX9-NEXT: s_and_b32 s15, s73, 0xff +; GFX9-NEXT: s_lshl_b32 s16, s80, 8 ; GFX9-NEXT: s_or_b32 s15, s15, s16 ; GFX9-NEXT: s_and_b32 s14, s14, 0xffff ; GFX9-NEXT: s_lshl_b32 s15, s15, 16 @@ -172585,10 +171935,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 ; GFX9-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff -; GFX9-NEXT: s_lshl_b32 s14, s39, 8 +; GFX9-NEXT: s_lshl_b32 s14, s69, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s14 -; GFX9-NEXT: s_and_b32 s14, s54, 0xff -; GFX9-NEXT: s_lshl_b32 s15, s74, 8 +; GFX9-NEXT: s_and_b32 s14, s67, 0xff +; GFX9-NEXT: s_lshl_b32 s15, s88, 8 ; GFX9-NEXT: s_or_b32 s14, s14, s15 ; GFX9-NEXT: s_and_b32 s12, s12, 0xffff ; GFX9-NEXT: s_lshl_b32 s14, s14, 16 @@ -172596,25 +171946,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-NEXT: s_and_b32 s12, s13, 0xff -; GFX9-NEXT: v_readlane_b32 s13, v21, 39 -; GFX9-NEXT: s_lshl_b32 s13, s13, 8 +; GFX9-NEXT: s_lshl_b32 s13, s66, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 -; GFX9-NEXT: v_readlane_b32 s13, v21, 3 -; GFX9-NEXT: s_and_b32 s13, s13, 0xff -; GFX9-NEXT: s_lshl_b32 s14, s63, 8 +; GFX9-NEXT: s_and_b32 s13, s75, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s65, 8 ; GFX9-NEXT: s_or_b32 s13, s13, s14 ; GFX9-NEXT: s_and_b32 s12, s12, 0xffff ; GFX9-NEXT: s_lshl_b32 s13, s13, 16 ; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 ; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_readlane_b32 s12, v21, 38 +; GFX9-NEXT: v_readlane_b32 s12, v21, 42 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s12 -; GFX9-NEXT: v_readlane_b32 s12, v21, 37 +; GFX9-NEXT: v_readlane_b32 s12, v21, 41 ; GFX9-NEXT: s_and_b32 s12, s12, 0xff -; GFX9-NEXT: s_lshl_b32 s13, s62, 8 +; GFX9-NEXT: s_lshl_b32 s13, s78, 8 ; GFX9-NEXT: s_or_b32 s12, s12, s13 ; GFX9-NEXT: s_and_b32 s10, s10, 0xffff ; GFX9-NEXT: s_lshl_b32 s12, s12, 16 @@ -172622,25 +171970,24 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff -; GFX9-NEXT: v_readlane_b32 s11, v21, 36 +; GFX9-NEXT: v_readlane_b32 s11, v21, 40 ; GFX9-NEXT: s_lshl_b32 s11, s11, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 -; GFX9-NEXT: v_readlane_b32 s11, v21, 4 -; GFX9-NEXT: s_and_b32 s11, s11, 0xff -; GFX9-NEXT: s_lshl_b32 s12, s61, 8 +; GFX9-NEXT: s_and_b32 s11, s77, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s79, 8 ; GFX9-NEXT: s_or_b32 s11, s11, s12 ; GFX9-NEXT: s_and_b32 s10, s10, 0xffff ; GFX9-NEXT: s_lshl_b32 s11, s11, 16 ; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 ; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_readlane_b32 s10, v21, 35 +; GFX9-NEXT: v_readlane_b32 s10, v21, 39 ; GFX9-NEXT: s_and_b32 s8, s8, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: v_readlane_b32 s10, v21, 34 +; GFX9-NEXT: v_readlane_b32 s10, v21, 38 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff -; GFX9-NEXT: s_lshl_b32 s11, s60, 8 +; GFX9-NEXT: s_lshl_b32 s11, s72, 8 ; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 @@ -172648,12 +171995,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: s_and_b32 s8, s9, 0xff -; GFX9-NEXT: v_readlane_b32 s9, v21, 33 +; GFX9-NEXT: v_readlane_b32 s9, v21, 37 ; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: v_readlane_b32 s10, v21, 36 ; GFX9-NEXT: s_or_b32 s8, s8, s9 -; GFX9-NEXT: v_readlane_b32 s9, v21, 5 -; GFX9-NEXT: v_readlane_b32 s10, v21, 32 -; GFX9-NEXT: s_and_b32 s9, s9, 0xff +; GFX9-NEXT: s_and_b32 s9, s93, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s10, 8 ; GFX9-NEXT: s_or_b32 s9, s9, s10 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff @@ -172661,11 +172007,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 35 ; GFX9-NEXT: s_and_b32 s6, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s37, 8 +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: s_and_b32 s8, s35, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s56, 8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 34 +; GFX9-NEXT: s_and_b32 s8, s8, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s60, 8 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 @@ -172673,22 +172021,26 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_and_b32 s6, s7, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s31, 8 +; GFX9-NEXT: v_readlane_b32 s7, v21, 33 +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: v_readlane_b32 s8, v21, 32 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_readlane_b32 s7, v21, 6 -; GFX9-NEXT: s_and_b32 s7, s7, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s95, 8 +; GFX9-NEXT: s_and_b32 s7, s35, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_readlane_b32 s6, v21, 31 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s91, 8 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: s_and_b32 s6, s77, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s46, 8 +; GFX9-NEXT: v_readlane_b32 s6, v21, 30 +; GFX9-NEXT: v_readlane_b32 s8, v21, 4 +; GFX9-NEXT: s_and_b32 s6, s6, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s8, 8 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 @@ -172696,17 +172048,21 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_and_b32 s4, s5, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s73, 8 +; GFX9-NEXT: v_readlane_b32 s5, v21, 29 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: v_readlane_b32 s6, v21, 28 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_readlane_b32 s5, v21, 7 -; GFX9-NEXT: s_and_b32 s5, s5, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s59, 8 +; GFX9-NEXT: s_and_b32 s5, s39, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_readlane_b32 s25, v21, 1 +; GFX9-NEXT: v_readlane_b32 s23, v21, 3 +; GFX9-NEXT: v_readlane_b32 s9, v21, 5 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: v_readlane_b32 s99, v20, 35 ; GFX9-NEXT: v_readlane_b32 s98, v20, 34 @@ -172753,113 +172109,111 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: .LBB91_4: ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; kill: killed $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr47 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; kill: killed $sgpr61 +; GFX9-NEXT: ; implicit-def: $vcc_lo +; GFX9-NEXT: ; implicit-def: $sgpr57 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr38 +; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr51 ; GFX9-NEXT: ; implicit-def: $sgpr99 -; GFX9-NEXT: ; implicit-def: $sgpr97 -; GFX9-NEXT: ; implicit-def: $sgpr66 -; GFX9-NEXT: ; implicit-def: $sgpr85 -; GFX9-NEXT: ; implicit-def: $sgpr65 -; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr80 -; GFX9-NEXT: ; implicit-def: $sgpr71 ; GFX9-NEXT: ; implicit-def: $sgpr53 -; GFX9-NEXT: ; implicit-def: $sgpr51 -; GFX9-NEXT: ; implicit-def: $sgpr50 +; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr48 -; GFX9-NEXT: ; implicit-def: $sgpr38 -; GFX9-NEXT: ; implicit-def: $sgpr68 -; GFX9-NEXT: ; implicit-def: $sgpr69 -; GFX9-NEXT: ; implicit-def: $sgpr98 -; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr54 +; GFX9-NEXT: ; implicit-def: $sgpr97 ; GFX9-NEXT: ; implicit-def: $sgpr87 ; GFX9-NEXT: ; implicit-def: $sgpr86 -; GFX9-NEXT: ; implicit-def: $sgpr84 -; GFX9-NEXT: ; implicit-def: $sgpr83 -; GFX9-NEXT: ; implicit-def: $sgpr81 -; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr68 ; GFX9-NEXT: ; implicit-def: $sgpr70 -; GFX9-NEXT: ; implicit-def: $sgpr52 +; GFX9-NEXT: ; implicit-def: $sgpr71 +; GFX9-NEXT: ; implicit-def: $sgpr64 ; GFX9-NEXT: ; implicit-def: $sgpr55 -; GFX9-NEXT: ; implicit-def: $sgpr49 -; GFX9-NEXT: ; implicit-def: $sgpr39 -; GFX9-NEXT: ; implicit-def: $sgpr54 -; GFX9-NEXT: ; implicit-def: $sgpr63 -; GFX9-NEXT: ; implicit-def: $sgpr61 -; GFX9-NEXT: ; implicit-def: $sgpr37 -; GFX9-NEXT: ; implicit-def: $sgpr35 -; GFX9-NEXT: ; implicit-def: $sgpr31 -; GFX9-NEXT: ; implicit-def: $sgpr95 -; GFX9-NEXT: ; implicit-def: $sgpr91 -; GFX9-NEXT: ; implicit-def: $sgpr77 -; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr59 -; GFX9-NEXT: ; implicit-def: $sgpr36 -; GFX9-NEXT: ; implicit-def: $sgpr34 +; GFX9-NEXT: ; implicit-def: $sgpr58 ; GFX9-NEXT: ; implicit-def: $sgpr30 +; GFX9-NEXT: ; implicit-def: $sgpr37 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr56 +; GFX9-NEXT: ; implicit-def: $sgpr82 +; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr94 +; GFX9-NEXT: ; implicit-def: $sgpr49 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr98 ; GFX9-NEXT: ; implicit-def: $sgpr90 -; GFX9-NEXT: ; implicit-def: $sgpr76 -; GFX9-NEXT: ; implicit-def: $sgpr72 -; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr96 +; GFX9-NEXT: ; implicit-def: $sgpr73 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr67 ; GFX9-NEXT: ; implicit-def: $sgpr88 -; GFX9-NEXT: ; implicit-def: $sgpr92 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr75 +; GFX9-NEXT: ; implicit-def: $sgpr65 ; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr74 -; GFX9-NEXT: ; implicit-def: $sgpr62 -; GFX9-NEXT: ; implicit-def: $sgpr60 -; GFX9-NEXT: ; implicit-def: $sgpr56 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 -; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr77 +; GFX9-NEXT: ; implicit-def: $sgpr79 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr93 +; GFX9-NEXT: ; implicit-def: $sgpr35 +; GFX9-NEXT: ; implicit-def: $sgpr39 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; kill: killed $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; kill: killed $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; kill: killed $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; kill: killed $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; kill: killed $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; kill: killed $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; kill: killed $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; kill: killed $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; kill: killed $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; kill: killed $sgpr61 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; kill: killed $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; kill: killed $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; kill: killed $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: ; implicit-def: $sgpr60 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 ; GFX9-NEXT: ; implicit-def: $sgpr46 @@ -172871,9 +172225,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 ; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: v_writelane_b32 v21, s46, 0 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: v_writelane_b32 v21, s47, 1 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 ; GFX9-NEXT: ; implicit-def: $sgpr46 @@ -172881,10 +172236,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: ; kill: killed $sgpr46 ; GFX9-NEXT: ; implicit-def: $sgpr46 -; GFX9-NEXT: ; kill: killed $sgpr46 +; GFX9-NEXT: v_writelane_b32 v21, s46, 2 +; GFX9-NEXT: v_writelane_b32 v21, s47, 3 ; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: v_writelane_b32 v21, vcc_lo, 4 ; GFX9-NEXT: ; kill: killed $sgpr46 ; GFX9-NEXT: ; implicit-def: $sgpr46 +; GFX9-NEXT: v_writelane_b32 v21, vcc_hi, 5 ; GFX9-NEXT: ; kill: killed $sgpr46 ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: s_branch .LBB91_2 @@ -172893,10 +172251,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x2 ; 12-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v18, s32 ; GFX11-NEXT: scratch_store_b32 off, v19, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v20, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v21, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 ; GFX11-NEXT: v_writelane_b32 v18, s30, 0 ; GFX11-NEXT: v_writelane_b32 v19, s96, 0 @@ -172931,13 +172290,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_writelane_b32 v18, s38, 6 ; GFX11-NEXT: v_writelane_b32 v19, s102, 6 ; GFX11-NEXT: v_readfirstlane_b32 s5, v14 +; GFX11-NEXT: s_mov_b32 s34, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: ; implicit-def: $vgpr20 : SGPR spill to VGPR lane ; GFX11-NEXT: v_writelane_b32 v18, s39, 7 ; GFX11-NEXT: v_writelane_b32 v19, s103, 7 +; GFX11-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane +; GFX11-NEXT: ; implicit-def: $vgpr20 : SGPR spill to VGPR lane ; GFX11-NEXT: v_writelane_b32 v18, s48, 8 ; GFX11-NEXT: v_writelane_b32 v19, s104, 8 -; GFX11-NEXT: s_mov_b32 s104, 0 ; GFX11-NEXT: v_writelane_b32 v18, s49, 9 ; GFX11-NEXT: v_writelane_b32 v18, s50, 10 ; GFX11-NEXT: v_writelane_b32 v18, s51, 11 @@ -172961,290 +172321,150 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_writelane_b32 v18, s85, 29 ; GFX11-NEXT: v_writelane_b32 v18, s86, 30 ; GFX11-NEXT: v_writelane_b32 v18, s87, 31 -; GFX11-NEXT: s_cbranch_scc0 .LBB91_2 +; GFX11-NEXT: s_cbranch_scc0 .LBB91_4 ; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: s_lshr_b32 s42, s25, 16 -; GFX11-NEXT: s_lshr_b32 s44, s28, 16 -; GFX11-NEXT: v_writelane_b32 v20, s42, 20 +; GFX11-NEXT: s_lshr_b32 s42, s25, 24 +; GFX11-NEXT: s_lshr_b32 s57, s29, 16 +; GFX11-NEXT: v_writelane_b32 v21, s42, 10 +; GFX11-NEXT: s_lshr_b32 s42, s25, 8 +; GFX11-NEXT: s_lshr_b32 s56, s41, 16 +; GFX11-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 +; GFX11-NEXT: v_writelane_b32 v21, s42, 11 +; GFX11-NEXT: s_lshr_b32 s42, s24, 16 +; GFX11-NEXT: s_mov_b32 s61, s56 +; GFX11-NEXT: s_mov_b32 s63, s57 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; GFX11-NEXT: v_writelane_b32 v21, s42, 12 +; GFX11-NEXT: s_lshr_b32 s42, s24, 8 +; GFX11-NEXT: s_lshr_b32 s73, s9, 16 +; GFX11-NEXT: s_lshr_b32 s72, s11, 16 +; GFX11-NEXT: s_lshr_b32 s92, s7, 16 +; GFX11-NEXT: v_writelane_b32 v21, s42, 13 +; GFX11-NEXT: s_lshr_b32 s42, s23, 24 +; GFX11-NEXT: s_lshr_b32 s79, s13, 16 +; GFX11-NEXT: s_lshr_b32 s78, s15, 16 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 +; GFX11-NEXT: v_writelane_b32 v21, s42, 14 ; GFX11-NEXT: s_lshr_b32 s42, s23, 16 -; GFX11-NEXT: s_lshr_b32 s46, s41, 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[26:27], 24 -; GFX11-NEXT: s_mov_b32 s91, s46 -; GFX11-NEXT: v_writelane_b32 v20, s42, 19 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 +; GFX11-NEXT: s_mov_b32 s75, s72 +; GFX11-NEXT: s_mov_b32 s77, s73 +; GFX11-NEXT: v_writelane_b32 v21, s42, 9 +; GFX11-NEXT: s_lshr_b32 s42, s23, 8 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[2:3], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[0:1], 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[12:13], 24 +; GFX11-NEXT: v_writelane_b32 v21, s42, 15 +; GFX11-NEXT: s_lshr_b32 s42, s22, 16 +; GFX11-NEXT: s_lshr_b32 s35, s27, 24 +; GFX11-NEXT: s_lshr_b32 s68, s27, 16 +; GFX11-NEXT: s_lshr_b32 s52, s27, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 16 ; GFX11-NEXT: s_lshr_b32 s42, s21, 16 -; GFX11-NEXT: s_lshr_b64 s[46:47], s[24:25], 24 -; GFX11-NEXT: s_lshr_b32 s72, s7, 24 -; GFX11-NEXT: s_lshr_b32 s73, s7, 8 -; GFX11-NEXT: v_writelane_b32 v20, s42, 18 +; GFX11-NEXT: s_lshr_b32 s37, s26, 16 +; GFX11-NEXT: s_lshr_b32 s70, s26, 8 +; GFX11-NEXT: s_lshr_b32 s67, s25, 16 +; GFX11-NEXT: v_writelane_b32 v21, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s19, 16 -; GFX11-NEXT: s_lshr_b32 s77, s27, 24 -; GFX11-NEXT: s_lshr_b32 s76, s6, 16 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v20, s42, 17 +; GFX11-NEXT: s_lshr_b32 s38, s22, 8 +; GFX11-NEXT: s_lshr_b32 s39, s21, 24 +; GFX11-NEXT: s_lshr_b32 s48, s21, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s17, 16 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[2:3], 24 -; GFX11-NEXT: s_lshr_b32 s88, s27, 8 -; GFX11-NEXT: s_mov_b32 s79, s72 -; GFX11-NEXT: v_writelane_b32 v20, s42, 16 -; GFX11-NEXT: s_lshr_b32 s42, s3, 24 -; GFX11-NEXT: s_mov_b32 s93, s73 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[0:1], 24 -; GFX11-NEXT: s_lshr_b64 s[72:73], s[12:13], 24 -; GFX11-NEXT: v_writelane_b32 v20, s42, 21 -; GFX11-NEXT: s_lshr_b32 s42, s3, 16 -; GFX11-NEXT: s_lshr_b32 vcc_lo, s26, 16 -; GFX11-NEXT: s_lshr_b32 s56, s4, 16 -; GFX11-NEXT: s_lshr_b32 s57, s4, 8 -; GFX11-NEXT: v_writelane_b32 v20, s42, 15 -; GFX11-NEXT: s_lshr_b32 s42, s3, 8 -; GFX11-NEXT: s_mov_b32 s95, s76 -; GFX11-NEXT: s_mov_b32 s73, s77 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[14:15], 24 -; GFX11-NEXT: v_writelane_b32 v20, s42, 22 -; GFX11-NEXT: s_lshr_b32 s42, s2, 16 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 -; GFX11-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 -; GFX11-NEXT: s_mov_b32 s77, s88 -; GFX11-NEXT: v_writelane_b32 v20, s42, 23 -; GFX11-NEXT: s_lshr_b32 s42, s2, 8 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[28:29], 24 -; GFX11-NEXT: s_lshr_b32 s71, s27, 16 -; GFX11-NEXT: s_lshr_b32 s83, s26, 8 -; GFX11-NEXT: v_writelane_b32 v20, s42, 24 -; GFX11-NEXT: s_lshr_b32 s42, s1, 24 -; GFX11-NEXT: s_lshr_b32 s66, s25, 24 -; GFX11-NEXT: s_lshr_b32 s67, s25, 8 -; GFX11-NEXT: s_lshr_b32 s68, s24, 16 -; GFX11-NEXT: v_writelane_b32 v20, s42, 25 -; GFX11-NEXT: s_lshr_b32 s42, s1, 16 -; GFX11-NEXT: s_lshr_b32 s49, s24, 8 -; GFX11-NEXT: s_lshr_b32 s69, s23, 24 -; GFX11-NEXT: s_lshr_b32 s70, s23, 8 -; GFX11-NEXT: v_writelane_b32 v20, s42, 14 -; GFX11-NEXT: s_lshr_b32 s42, s1, 8 -; GFX11-NEXT: s_lshr_b32 s64, s22, 16 -; GFX11-NEXT: s_lshr_b32 s80, s22, 8 -; GFX11-NEXT: s_lshr_b32 s58, s21, 24 -; GFX11-NEXT: v_writelane_b32 v20, s42, 26 +; GFX11-NEXT: s_lshr_b32 s49, s20, 16 +; GFX11-NEXT: s_lshr_b32 s50, s20, 8 +; GFX11-NEXT: s_lshr_b32 s69, s19, 24 +; GFX11-NEXT: v_writelane_b32 v21, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s5, 24 -; GFX11-NEXT: s_lshr_b32 s59, s21, 8 -; GFX11-NEXT: s_lshr_b32 s50, s20, 16 -; GFX11-NEXT: s_lshr_b32 s81, s20, 8 -; GFX11-NEXT: v_writelane_b32 v20, s42, 27 +; GFX11-NEXT: s_lshr_b32 s71, s19, 8 +; GFX11-NEXT: s_lshr_b32 s81, s18, 16 +; GFX11-NEXT: s_lshr_b32 s82, s18, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 18 ; GFX11-NEXT: s_lshr_b32 s42, s5, 16 -; GFX11-NEXT: s_lshr_b32 s85, s19, 24 -; GFX11-NEXT: s_lshr_b32 s60, s19, 8 -; GFX11-NEXT: s_lshr_b32 s61, s18, 16 -; GFX11-NEXT: v_writelane_b32 v20, s42, 13 +; GFX11-NEXT: s_lshr_b32 s54, s17, 24 +; GFX11-NEXT: s_lshr_b32 s85, s17, 8 +; GFX11-NEXT: s_lshr_b32 s87, s16, 16 +; GFX11-NEXT: v_writelane_b32 v21, s42, 17 ; GFX11-NEXT: s_lshr_b32 s42, s5, 8 -; GFX11-NEXT: s_lshr_b32 s96, s18, 8 -; GFX11-NEXT: s_lshr_b32 s98, s17, 24 -; GFX11-NEXT: s_lshr_b32 s99, s17, 8 -; GFX11-NEXT: v_writelane_b32 v20, s42, 28 -; GFX11-NEXT: s_lshr_b32 s42, s7, 16 -; GFX11-NEXT: s_lshr_b32 s53, s16, 16 -; GFX11-NEXT: s_lshr_b32 s43, s16, 8 -; GFX11-NEXT: s_lshr_b32 s102, s0, 16 -; GFX11-NEXT: v_writelane_b32 v20, s42, 12 -; GFX11-NEXT: s_lshr_b32 s42, s9, 16 -; GFX11-NEXT: s_lshr_b32 s103, s0, 8 -; GFX11-NEXT: s_lshr_b32 s34, s6, 8 -; GFX11-NEXT: s_lshr_b32 s65, s9, 24 -; GFX11-NEXT: v_writelane_b32 v20, s42, 11 -; GFX11-NEXT: s_lshr_b32 s42, s11, 16 -; GFX11-NEXT: s_lshr_b32 s55, s9, 8 -; GFX11-NEXT: s_lshr_b32 s35, s8, 16 -; GFX11-NEXT: s_lshr_b32 s36, s8, 8 -; GFX11-NEXT: v_writelane_b32 v20, s42, 10 -; GFX11-NEXT: s_lshr_b32 s42, s13, 16 -; GFX11-NEXT: s_lshr_b32 s37, s11, 24 -; GFX11-NEXT: s_lshr_b32 s38, s11, 8 -; GFX11-NEXT: s_lshr_b32 s39, s10, 16 -; GFX11-NEXT: v_writelane_b32 v20, s42, 9 -; GFX11-NEXT: s_lshr_b32 s42, s15, 16 -; GFX11-NEXT: s_lshr_b32 s48, s10, 8 -; GFX11-NEXT: s_lshr_b32 s84, s13, 24 -; GFX11-NEXT: s_lshr_b32 s82, s13, 8 -; GFX11-NEXT: v_writelane_b32 v20, s42, 8 -; GFX11-NEXT: s_lshr_b32 s86, s12, 16 -; GFX11-NEXT: s_lshr_b32 s51, s12, 8 -; GFX11-NEXT: s_lshr_b32 s97, s15, 24 -; GFX11-NEXT: s_lshr_b32 s87, s15, 8 -; GFX11-NEXT: v_writelane_b32 v20, s44, 29 +; GFX11-NEXT: s_lshr_b32 s55, s16, 8 +; GFX11-NEXT: s_lshr_b32 s97, s3, 24 +; GFX11-NEXT: s_lshr_b32 s66, s3, 16 +; GFX11-NEXT: v_writelane_b32 v21, s42, 19 +; GFX11-NEXT: s_lshr_b32 s42, s4, 16 +; GFX11-NEXT: s_lshr_b32 s99, s3, 8 +; GFX11-NEXT: s_lshr_b32 s100, s2, 16 +; GFX11-NEXT: s_lshr_b32 s64, s2, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 20 +; GFX11-NEXT: s_lshr_b32 s42, s4, 8 +; GFX11-NEXT: s_lshr_b32 s101, s1, 24 +; GFX11-NEXT: s_lshr_b32 vcc_hi, s1, 16 +; GFX11-NEXT: s_lshr_b32 s102, s1, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 21 +; GFX11-NEXT: s_lshr_b32 s42, s7, 24 +; GFX11-NEXT: s_lshr_b32 s103, s0, 16 +; GFX11-NEXT: s_lshr_b32 s104, s0, 8 +; GFX11-NEXT: s_lshr_b32 s51, s13, 24 +; GFX11-NEXT: v_writelane_b32 v21, s42, 22 +; GFX11-NEXT: s_lshr_b32 s42, s7, 8 +; GFX11-NEXT: s_lshr_b32 s36, s13, 8 +; GFX11-NEXT: s_lshr_b32 s53, s12, 16 +; GFX11-NEXT: s_lshr_b32 s80, s12, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 23 +; GFX11-NEXT: s_lshr_b32 s42, s6, 16 +; GFX11-NEXT: s_lshr_b32 s83, s15, 24 +; GFX11-NEXT: s_lshr_b32 s84, s15, 8 +; GFX11-NEXT: s_lshr_b32 s86, s14, 16 +; GFX11-NEXT: v_writelane_b32 v21, s42, 24 +; GFX11-NEXT: s_lshr_b32 s42, s6, 8 +; GFX11-NEXT: s_lshr_b32 s96, s14, 8 +; GFX11-NEXT: s_lshr_b32 s98, s29, 24 +; GFX11-NEXT: s_lshr_b32 s43, s28, 16 +; GFX11-NEXT: v_writelane_b32 v21, s42, 25 +; GFX11-NEXT: s_lshr_b32 s42, s9, 24 ; GFX11-NEXT: s_lshr_b32 s44, s28, 8 -; GFX11-NEXT: s_lshr_b32 s52, s14, 16 -; GFX11-NEXT: s_lshr_b32 s100, s14, 8 -; GFX11-NEXT: s_lshr_b32 s42, s29, 24 -; GFX11-NEXT: v_writelane_b32 v20, s44, 30 -; GFX11-NEXT: s_lshr_b32 s44, s41, 8 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s29, 16 -; GFX11-NEXT: s_lshr_b32 s101, s29, 8 -; GFX11-NEXT: s_lshr_b32 s54, s41, 16 -; GFX11-NEXT: v_writelane_b32 v20, s44, 31 -; GFX11-NEXT: s_lshr_b32 s44, s40, 16 -; GFX11-NEXT: s_lshr_b32 s45, s40, 8 -; GFX11-NEXT: s_mov_b32 s63, s56 -; GFX11-NEXT: s_mov_b32 s75, s57 -; GFX11-NEXT: v_writelane_b32 v20, s46, 2 -; GFX11-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 -; GFX11-NEXT: s_mov_b32 s89, vcc_lo +; GFX11-NEXT: s_lshr_b32 s45, s41, 24 +; GFX11-NEXT: s_lshr_b32 s65, s41, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 26 +; GFX11-NEXT: s_lshr_b32 s42, s9, 8 +; GFX11-NEXT: s_lshr_b32 s46, s40, 16 +; GFX11-NEXT: s_lshr_b32 s47, s40, 8 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[26:27], 24 +; GFX11-NEXT: v_writelane_b32 v21, s42, 27 +; GFX11-NEXT: s_lshr_b32 s42, s8, 16 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 +; GFX11-NEXT: s_mov_b32 s89, s79 +; GFX11-NEXT: s_mov_b32 s91, s92 +; GFX11-NEXT: v_writelane_b32 v21, s42, 28 +; GFX11-NEXT: s_lshr_b32 s42, s8, 8 +; GFX11-NEXT: s_mov_b32 s73, s78 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[28:29], 24 +; GFX11-NEXT: v_writelane_b32 v21, s42, 29 +; GFX11-NEXT: s_lshr_b32 s42, s11, 24 ; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v20, s47, 3 -; GFX11-NEXT: s_lshr_b64 s[46:47], s[22:23], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v20, s46, 0 -; GFX11-NEXT: v_writelane_b32 v20, s47, 1 -; GFX11-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; GFX11-NEXT: v_writelane_b32 v20, s46, 6 -; GFX11-NEXT: v_writelane_b32 v20, s47, 7 -; GFX11-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v20, s46, 4 -; GFX11-NEXT: v_writelane_b32 v20, s47, 5 -; GFX11-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 -; GFX11-NEXT: s_branch .LBB91_3 -; GFX11-NEXT: .LBB91_2: -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; kill: killed $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr43 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $vcc_hi -; GFX11-NEXT: s_mov_b32 s104, -1 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; kill: killed $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr103 -; GFX11-NEXT: ; implicit-def: $sgpr102 -; GFX11-NEXT: ; implicit-def: $sgpr94 -; GFX11-NEXT: ; implicit-def: $sgpr92 -; GFX11-NEXT: ; implicit-def: $sgpr53 -; GFX11-NEXT: ; implicit-def: $sgpr78 -; GFX11-NEXT: ; implicit-def: $sgpr99 -; GFX11-NEXT: ; implicit-def: $sgpr98 -; GFX11-NEXT: ; implicit-def: $sgpr96 -; GFX11-NEXT: ; implicit-def: $sgpr61 -; GFX11-NEXT: ; implicit-def: $sgpr74 -; GFX11-NEXT: ; implicit-def: $sgpr60 -; GFX11-NEXT: ; implicit-def: $sgpr85 -; GFX11-NEXT: ; implicit-def: $sgpr81 -; GFX11-NEXT: ; implicit-def: $sgpr50 -; GFX11-NEXT: ; implicit-def: $sgpr62 -; GFX11-NEXT: ; implicit-def: $sgpr59 -; GFX11-NEXT: ; implicit-def: $sgpr58 -; GFX11-NEXT: ; implicit-def: $sgpr80 -; GFX11-NEXT: ; implicit-def: $sgpr64 -; GFX11-NEXT: ; implicit-def: $sgpr70 -; GFX11-NEXT: ; implicit-def: $sgpr69 -; GFX11-NEXT: ; implicit-def: $sgpr49 -; GFX11-NEXT: ; implicit-def: $sgpr68 -; GFX11-NEXT: ; implicit-def: $sgpr67 -; GFX11-NEXT: ; implicit-def: $sgpr66 -; GFX11-NEXT: ; implicit-def: $sgpr83 -; GFX11-NEXT: ; implicit-def: $sgpr89 -; GFX11-NEXT: ; implicit-def: $sgpr77 -; GFX11-NEXT: ; implicit-def: $sgpr71 -; GFX11-NEXT: ; implicit-def: $sgpr73 -; GFX11-NEXT: ; implicit-def: $sgpr45 -; GFX11-NEXT: ; implicit-def: $sgpr44 -; GFX11-NEXT: ; implicit-def: $sgpr54 -; GFX11-NEXT: ; implicit-def: $sgpr91 -; GFX11-NEXT: ; implicit-def: $sgpr101 -; GFX11-NEXT: ; implicit-def: $sgpr100 -; GFX11-NEXT: ; implicit-def: $sgpr52 -; GFX11-NEXT: ; implicit-def: $sgpr87 -; GFX11-NEXT: ; implicit-def: $sgpr97 -; GFX11-NEXT: ; implicit-def: $sgpr51 -; GFX11-NEXT: ; implicit-def: $sgpr86 -; GFX11-NEXT: ; implicit-def: $sgpr82 -; GFX11-NEXT: ; implicit-def: $sgpr84 -; GFX11-NEXT: ; implicit-def: $sgpr48 -; GFX11-NEXT: ; implicit-def: $sgpr39 -; GFX11-NEXT: ; implicit-def: $sgpr38 -; GFX11-NEXT: ; implicit-def: $sgpr37 -; GFX11-NEXT: ; implicit-def: $sgpr36 -; GFX11-NEXT: ; implicit-def: $sgpr35 -; GFX11-NEXT: ; implicit-def: $sgpr55 -; GFX11-NEXT: ; implicit-def: $sgpr65 -; GFX11-NEXT: ; implicit-def: $sgpr34 -; GFX11-NEXT: ; implicit-def: $sgpr95 -; GFX11-NEXT: ; implicit-def: $sgpr93 -; GFX11-NEXT: ; implicit-def: $sgpr79 -; GFX11-NEXT: ; implicit-def: $sgpr75 -; GFX11-NEXT: ; implicit-def: $sgpr63 -; GFX11-NEXT: ; implicit-def: $sgpr90 -; GFX11-NEXT: ; implicit-def: $sgpr30 -; GFX11-NEXT: ; implicit-def: $sgpr88 -; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: ; implicit-def: $sgpr72 -; GFX11-NEXT: ; implicit-def: $sgpr56 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; kill: killed $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; kill: killed $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; kill: killed $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; kill: killed $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; kill: killed $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; kill: killed $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: v_writelane_b32 v21, s42, 30 +; GFX11-NEXT: s_lshr_b32 s42, s11, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s42, 31 +; GFX11-NEXT: s_lshr_b32 s42, s10, 16 ; GFX11-NEXT: v_writelane_b32 v20, s42, 0 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v20, s43, 1 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v20, s46, 2 -; GFX11-NEXT: ; implicit-def: $sgpr46 -; GFX11-NEXT: v_writelane_b32 v20, s47, 3 -; GFX11-NEXT: v_writelane_b32 v20, vcc_lo, 4 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 5 -; GFX11-NEXT: v_writelane_b32 v20, vcc_lo, 6 -; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 7 -; GFX11-NEXT: .LBB91_3: ; %Flow -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s104 -; GFX11-NEXT: s_mov_b32 s104, s54 -; GFX11-NEXT: s_mov_b32 s54, vcc_hi -; GFX11-NEXT: s_mov_b32 vcc_hi, s34 -; GFX11-NEXT: s_mov_b32 s34, s65 -; GFX11-NEXT: s_mov_b32 s65, s84 -; GFX11-NEXT: s_mov_b32 s84, s86 -; GFX11-NEXT: s_mov_b32 s86, s97 -; GFX11-NEXT: s_mov_b32 s97, s43 -; GFX11-NEXT: s_cbranch_vccnz .LBB91_5 -; GFX11-NEXT: ; %bb.4: ; %cmp.true +; GFX11-NEXT: s_lshr_b32 s42, s10, 8 +; GFX11-NEXT: v_writelane_b32 v21, s56, 4 +; GFX11-NEXT: v_writelane_b32 v20, s42, 1 +; GFX11-NEXT: s_lshr_b32 s42, s29, 8 +; GFX11-NEXT: v_writelane_b32 v21, s57, 5 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s56, 2 +; GFX11-NEXT: v_writelane_b32 v21, s57, 3 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[8:9], 24 +; GFX11-NEXT: v_writelane_b32 v21, s56, 0 +; GFX11-NEXT: v_writelane_b32 v21, s57, 1 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s34 +; GFX11-NEXT: s_cbranch_vccnz .LBB91_3 +; GFX11-NEXT: .LBB91_2: ; %cmp.true ; GFX11-NEXT: s_and_b32 s42, s41, 0xffff0000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s42 @@ -173258,7 +172478,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s42, s42, s43 ; GFX11-NEXT: s_lshl_b32 s41, s41, 16 -; GFX11-NEXT: s_lshr_b32 s104, s42, 16 +; GFX11-NEXT: s_lshr_b32 s76, s42, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s41 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s41, v1 @@ -173272,7 +172492,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s42, s40, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s41, s41, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s42 -; GFX11-NEXT: s_pack_ll_b32_b16 s31, s41, s104 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s42, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -173297,7 +172516,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s43, s29, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s40, s40, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s30, s40, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s34, s40, s42 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s43, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -173308,7 +172527,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s45, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s43, s43, s44 ; GFX11-NEXT: s_lshl_b32 s29, s29, 16 -; GFX11-NEXT: s_lshr_b32 s54, s43, 16 +; GFX11-NEXT: s_lshr_b32 s77, s43, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s29 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s29, v1 @@ -173322,7 +172541,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s43, s28, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s29, s29, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s43 -; GFX11-NEXT: s_pack_ll_b32_b16 s89, s29, s54 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s43, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -173347,7 +172565,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s44, s15, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s28, s28, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s88, s28, s43 +; GFX11-NEXT: s_pack_ll_b32_b16 s94, s28, s43 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s44, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -173358,10 +172576,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s46, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s44, s44, s45 ; GFX11-NEXT: s_lshl_b32 s15, s15, 16 -; GFX11-NEXT: s_lshr_b32 s72, s44, 16 +; GFX11-NEXT: s_lshr_b32 s93, s44, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s15 -; GFX11-NEXT: v_writelane_b32 v20, s72, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s15, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: s_bfe_u32 s45, s15, 0x10010 @@ -173373,7 +172590,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s44, s14, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s15, s15, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s44 -; GFX11-NEXT: s_pack_ll_b32_b16 s77, s15, s72 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s44, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -173398,7 +172614,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s45, s13, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s14, s14, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s76, s14, s44 +; GFX11-NEXT: s_pack_ll_b32_b16 s92, s14, s44 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s45, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -173409,10 +172625,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s45, s45, s46 ; GFX11-NEXT: s_lshl_b32 s13, s13, 16 -; GFX11-NEXT: s_lshr_b32 s73, s45, 16 +; GFX11-NEXT: s_lshr_b32 s90, s45, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s13 -; GFX11-NEXT: v_writelane_b32 v20, s73, 9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s13, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: s_bfe_u32 s46, s13, 0x10010 @@ -173424,7 +172639,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s45, s12, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s13, s13, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s45 -; GFX11-NEXT: s_pack_ll_b32_b16 s73, s13, s73 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s45, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -173449,7 +172663,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s46, s11, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s12, s12, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s72, s12, s45 +; GFX11-NEXT: s_pack_ll_b32_b16 s78, s12, s45 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s46, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -173460,10 +172674,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s46, s46, s47 ; GFX11-NEXT: s_lshl_b32 s11, s11, 16 -; GFX11-NEXT: s_lshr_b32 s75, s46, 16 +; GFX11-NEXT: s_lshr_b32 s79, s46, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s11 -; GFX11-NEXT: v_writelane_b32 v20, s75, 10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s11, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: s_bfe_u32 s47, s11, 0x10010 @@ -173509,10 +172722,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s47, s47, s56 ; GFX11-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-NEXT: s_lshr_b32 s78, s47, 16 +; GFX11-NEXT: s_lshr_b32 s88, s47, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s9 -; GFX11-NEXT: v_writelane_b32 v20, s78, 11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s9, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: s_bfe_u32 s56, s9, 0x10010 @@ -173534,793 +172746,802 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s47, s47, s56 ; GFX11-NEXT: s_lshl_b32 s8, s8, 16 -; GFX11-NEXT: s_lshr_b32 s58, s47, 16 +; GFX11-NEXT: s_lshr_b32 s47, s47, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s8, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-NEXT: s_bfe_u32 s56, s8, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s8 +; GFX11-NEXT: s_add_i32 s56, s56, s8 ; GFX11-NEXT: s_bitset1_b32 s8, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s8, s8, s47 -; GFX11-NEXT: s_and_b32 s47, s7, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s8, s8, s56 +; GFX11-NEXT: s_and_b32 s56, s7, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s8, s8, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s57, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s57, s57, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s57, 0x7fff +; GFX11-NEXT: s_and_b32 s58, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s57 ; GFX11-NEXT: s_lshl_b32 s7, s7, 16 -; GFX11-NEXT: s_lshr_b32 s79, s47, 16 +; GFX11-NEXT: s_lshr_b32 s95, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s7 -; GFX11-NEXT: v_writelane_b32 v20, s79, 12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s7, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s7, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s7 +; GFX11-NEXT: s_bfe_u32 s57, s7, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s57, s7 ; GFX11-NEXT: s_bitset1_b32 s7, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s7, s7, s47 -; GFX11-NEXT: s_and_b32 s47, s6, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s7, s7, s56 +; GFX11-NEXT: s_and_b32 s56, s6, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s7, s7, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s45, s7, s79 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s57, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s57, s57, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s57, 0x7fff +; GFX11-NEXT: s_and_b32 s58, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s57 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-NEXT: s_lshr_b32 s59, s47, 16 +; GFX11-NEXT: s_lshr_b32 s57, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s6, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s6, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s6 +; GFX11-NEXT: s_bfe_u32 s58, s6, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s58, s6 ; GFX11-NEXT: s_bitset1_b32 s6, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s6, s6, s47 -; GFX11-NEXT: s_and_b32 s47, s5, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s58, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s6, s6, s56 +; GFX11-NEXT: s_and_b32 s56, s5, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s6, s6, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s58, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s58, s58, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s58, 0x7fff +; GFX11-NEXT: s_and_b32 s59, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s58 ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_lshr_b32 s92, s47, 16 +; GFX11-NEXT: s_lshr_b32 s35, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s5 -; GFX11-NEXT: v_writelane_b32 v20, s92, 13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s5, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s5, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s5 +; GFX11-NEXT: s_bfe_u32 s58, s5, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s58, s5 ; GFX11-NEXT: s_bitset1_b32 s5, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s5, s5, s47 -; GFX11-NEXT: s_and_b32 s47, s4, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s58, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s5, s5, s56 +; GFX11-NEXT: s_and_b32 s56, s4, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s5, s5, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s58, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s58, s58, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s58, 0x7fff +; GFX11-NEXT: s_and_b32 s59, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s58 ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-NEXT: s_lshr_b32 s60, s47, 16 +; GFX11-NEXT: s_lshr_b32 s58, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s4, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s4 +; GFX11-NEXT: s_bfe_u32 s59, s4, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s59, s4 ; GFX11-NEXT: s_bitset1_b32 s4, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s4, s4, s47 -; GFX11-NEXT: s_and_b32 s47, s1, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s59, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s4, s4, s56 +; GFX11-NEXT: s_and_b32 s56, s1, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s4, s4, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s44, s4, s58 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s59, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s59, s59, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s59, 0x7fff +; GFX11-NEXT: s_and_b32 s60, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s59 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_lshr_b32 s93, s47, 16 +; GFX11-NEXT: s_lshr_b32 s89, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s1 -; GFX11-NEXT: v_writelane_b32 v20, s93, 14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 vcc_hi, s89 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s1, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s1 +; GFX11-NEXT: s_bfe_u32 s59, s1, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s59, s1 ; GFX11-NEXT: s_bitset1_b32 s1, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s1, s1, s47 -; GFX11-NEXT: s_and_b32 s47, s0, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s59, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s1, s1, s56 +; GFX11-NEXT: s_and_b32 s56, s0, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s1, s1, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s95, s1, s93 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s59, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s59, s59, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s59, 0x7fff +; GFX11-NEXT: s_and_b32 s60, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s59 ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: s_lshr_b32 s61, s47, 16 +; GFX11-NEXT: s_lshr_b32 s59, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s0, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s0 +; GFX11-NEXT: s_bfe_u32 s60, s0, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s60, s0 ; GFX11-NEXT: s_bitset1_b32 s0, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s0, s0, s47 -; GFX11-NEXT: s_and_b32 s47, s3, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s60, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s0, s0, s56 +; GFX11-NEXT: s_and_b32 s56, s3, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s94, s0, s61 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s60, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s60, s60, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s60, 0x7fff +; GFX11-NEXT: s_and_b32 s61, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s60 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_lshr_b32 s34, s47, 16 +; GFX11-NEXT: s_lshr_b32 s91, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s3 -; GFX11-NEXT: v_writelane_b32 v20, s34, 15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s66, s91 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s3, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s3 +; GFX11-NEXT: s_bfe_u32 s60, s3, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s60, s3 ; GFX11-NEXT: s_bitset1_b32 s3, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s3, s3, s47 -; GFX11-NEXT: s_and_b32 s47, s2, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s60, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s3, s3, s56 +; GFX11-NEXT: s_and_b32 s56, s2, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s93, s3, s34 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s31, s3, s91 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s60, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s60, s60, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s60, 0x7fff +; GFX11-NEXT: s_and_b32 s61, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s60 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: s_lshr_b32 s62, s47, 16 +; GFX11-NEXT: s_lshr_b32 s60, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s2, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s2 +; GFX11-NEXT: s_bfe_u32 s61, s2, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s61, s2 ; GFX11-NEXT: s_bitset1_b32 s2, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s2, s2, s47 -; GFX11-NEXT: s_and_b32 s47, s17, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s61, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s2, s2, s56 +; GFX11-NEXT: s_and_b32 s56, s17, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s30, s2, s60 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s61, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s61, s61, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s61, 0x7fff +; GFX11-NEXT: s_and_b32 s62, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s61 ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-NEXT: s_lshr_b32 s35, s47, 16 +; GFX11-NEXT: s_lshr_b32 s36, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s17 -; GFX11-NEXT: v_writelane_b32 v20, s35, 16 +; GFX11-NEXT: v_writelane_b32 v21, s36, 6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s17, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s17, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s17 +; GFX11-NEXT: s_bfe_u32 s61, s17, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s61, s17 ; GFX11-NEXT: s_bitset1_b32 s17, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s17, s17, s47 -; GFX11-NEXT: s_and_b32 s47, s16, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s61, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s17, s17, s56 +; GFX11-NEXT: s_and_b32 s56, s16, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s17, s17, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s79, s17, s35 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s37, s17, s36 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s61, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s61, s61, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s61, 0x7fff +; GFX11-NEXT: s_and_b32 s62, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s61 ; GFX11-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-NEXT: s_lshr_b32 s63, s47, 16 +; GFX11-NEXT: s_lshr_b32 s61, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s16, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s16, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s16 +; GFX11-NEXT: s_bfe_u32 s62, s16, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s62, s16 ; GFX11-NEXT: s_bitset1_b32 s16, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s16, s16, s47 -; GFX11-NEXT: s_and_b32 s47, s19, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s62, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s16, s16, s56 +; GFX11-NEXT: s_and_b32 s56, s19, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s16, s16, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s36, s16, s61 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s62, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s62, s62, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s62, 0x7fff +; GFX11-NEXT: s_and_b32 s63, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s62 ; GFX11-NEXT: s_lshl_b32 s19, s19, 16 -; GFX11-NEXT: s_lshr_b32 s36, s47, 16 +; GFX11-NEXT: s_lshr_b32 s38, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s19 -; GFX11-NEXT: v_writelane_b32 v20, s36, 17 +; GFX11-NEXT: v_writelane_b32 v21, s38, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s19, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s19, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s19 +; GFX11-NEXT: s_bfe_u32 s62, s19, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s62, s19 ; GFX11-NEXT: s_bitset1_b32 s19, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s19, s19, s47 -; GFX11-NEXT: s_and_b32 s47, s18, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s62, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s19, s19, s56 +; GFX11-NEXT: s_and_b32 s56, s18, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s19, s19, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s53, s19, s38 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s62, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s62, s62, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s62, 0x7fff +; GFX11-NEXT: s_and_b32 s63, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s62 ; GFX11-NEXT: s_lshl_b32 s18, s18, 16 -; GFX11-NEXT: s_lshr_b32 s74, s47, 16 +; GFX11-NEXT: s_lshr_b32 s62, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s18 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s18, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s18, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s18 +; GFX11-NEXT: s_bfe_u32 s63, s18, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s63, s18 ; GFX11-NEXT: s_bitset1_b32 s18, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s18, s18, s47 -; GFX11-NEXT: s_and_b32 s47, s21, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s63, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s18, s18, s56 +; GFX11-NEXT: s_and_b32 s56, s21, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s18, s18, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s74, s18, s74 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s52, s18, s62 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s63, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s63, s63, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s63, 0x7fff +; GFX11-NEXT: s_and_b32 s72, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s63 ; GFX11-NEXT: s_lshl_b32 s21, s21, 16 -; GFX11-NEXT: s_lshr_b32 s37, s47, 16 +; GFX11-NEXT: s_lshr_b32 s39, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s21 -; GFX11-NEXT: v_writelane_b32 v20, s37, 18 +; GFX11-NEXT: v_writelane_b32 v21, s39, 8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s21, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s21, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s21 +; GFX11-NEXT: s_bfe_u32 s63, s21, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s63, s21 ; GFX11-NEXT: s_bitset1_b32 s21, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s21, s21, s47 -; GFX11-NEXT: s_and_b32 s47, s20, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s63, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s21, s21, s56 +; GFX11-NEXT: s_and_b32 s56, s20, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s21, s21, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s51, s21, s39 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s63, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s63, s63, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s63, 0x7fff +; GFX11-NEXT: s_and_b32 s72, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s63 ; GFX11-NEXT: s_lshl_b32 s20, s20, 16 -; GFX11-NEXT: s_lshr_b32 s90, s47, 16 +; GFX11-NEXT: s_lshr_b32 s63, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s20 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s20, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s20, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s20 +; GFX11-NEXT: s_bfe_u32 s72, s20, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s72, s20 ; GFX11-NEXT: s_bitset1_b32 s20, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s20, s20, s47 -; GFX11-NEXT: s_and_b32 s47, s23, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s72, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s20, s20, s56 +; GFX11-NEXT: s_and_b32 s56, s23, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s20, s20, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s50, s20, s63 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s72, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s72, s72, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s72, 0x7fff +; GFX11-NEXT: s_and_b32 s73, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s72 ; GFX11-NEXT: s_lshl_b32 s23, s23, 16 -; GFX11-NEXT: s_lshr_b32 s38, s47, 16 +; GFX11-NEXT: s_lshr_b32 s48, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s23 -; GFX11-NEXT: v_writelane_b32 v20, s38, 19 +; GFX11-NEXT: v_writelane_b32 v21, s48, 9 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s23, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s23, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s23 +; GFX11-NEXT: s_bfe_u32 s72, s23, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s72, s23 ; GFX11-NEXT: s_bitset1_b32 s23, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s23, s23, s47 -; GFX11-NEXT: s_and_b32 s47, s22, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s72, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s23, s23, s56 +; GFX11-NEXT: s_and_b32 s56, s22, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s23, s23, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s35, s23, s38 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s39, s23, s48 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s72, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s72, s72, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s72, 0x7fff +; GFX11-NEXT: s_and_b32 s73, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s72 ; GFX11-NEXT: s_lshl_b32 s22, s22, 16 -; GFX11-NEXT: s_lshr_b32 s91, s47, 16 +; GFX11-NEXT: s_lshr_b32 s73, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s22, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s22, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s22 +; GFX11-NEXT: s_bfe_u32 s72, s22, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s72, s22 ; GFX11-NEXT: s_bitset1_b32 s22, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s22, s22, s47 -; GFX11-NEXT: s_and_b32 s47, s25, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s72, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s22, s22, s56 +; GFX11-NEXT: s_and_b32 s56, s25, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s22, s22, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s34, s22, s91 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s38, s22, s73 +; GFX11-NEXT: s_pack_ll_b32_b16 s73, s11, s79 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s72, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s72, s72, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s72, 0x7fff +; GFX11-NEXT: s_and_b32 s74, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s72 ; GFX11-NEXT: s_lshl_b32 s25, s25, 16 -; GFX11-NEXT: s_lshr_b32 s39, s47, 16 +; GFX11-NEXT: s_lshr_b32 s67, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s25 -; GFX11-NEXT: v_writelane_b32 v20, s39, 20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s25, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s25, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s25 +; GFX11-NEXT: s_bfe_u32 s72, s25, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s72, s25 ; GFX11-NEXT: s_bitset1_b32 s25, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s25, s25, s47 -; GFX11-NEXT: s_and_b32 s47, s24, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s72, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s25, s25, s56 +; GFX11-NEXT: s_and_b32 s56, s24, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s25, s25, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s49, s25, s67 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s47, v1 +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s47, 0x10010 -; GFX11-NEXT: s_add_i32 s56, s56, s47 -; GFX11-NEXT: s_bitset1_b32 s47, 22 -; GFX11-NEXT: s_addk_i32 s56, 0x7fff -; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s47, s47, s56 +; GFX11-NEXT: s_bfe_u32 s72, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s72, s72, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s72, 0x7fff +; GFX11-NEXT: s_and_b32 s74, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s56, s56, s72 ; GFX11-NEXT: s_lshl_b32 s24, s24, 16 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s47, 16 +; GFX11-NEXT: s_lshr_b32 s74, s56, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s24 -; GFX11-NEXT: s_pack_ll_b32_b16 s57, s11, s75 -; GFX11-NEXT: s_pack_ll_b32_b16 s75, s19, s36 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s24, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s56, s24, 0x10010 -; GFX11-NEXT: s_add_i32 s47, s56, s24 +; GFX11-NEXT: s_bfe_u32 s72, s24, 0x10010 +; GFX11-NEXT: s_add_i32 s56, s72, s24 ; GFX11-NEXT: s_bitset1_b32 s24, 22 -; GFX11-NEXT: s_addk_i32 s47, 0x7fff -; GFX11-NEXT: s_and_b32 s56, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s24, s24, s47 -; GFX11-NEXT: s_and_b32 s47, s27, 0xffff0000 +; GFX11-NEXT: s_addk_i32 s56, 0x7fff +; GFX11-NEXT: s_and_b32 s72, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s24, s24, s56 +; GFX11-NEXT: s_and_b32 s56, s27, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s24, s24, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s47 -; GFX11-NEXT: s_pack_ll_b32_b16 s56, s10, s46 -; GFX11-NEXT: s_pack_ll_b32_b16 s46, s8, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s47, s9, s78 -; GFX11-NEXT: s_pack_ll_b32_b16 s78, s16, s63 -; GFX11-NEXT: v_readfirstlane_b32 s42, v1 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s56 +; GFX11-NEXT: s_pack_ll_b32_b16 s48, s24, s74 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s56, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_pack_ll_b32_b16 s63, s21, s37 -; GFX11-NEXT: s_pack_ll_b32_b16 s37, s25, s39 -; GFX11-NEXT: s_pack_ll_b32_b16 s36, s24, vcc_hi -; GFX11-NEXT: s_bfe_u32 s43, s42, 0x10010 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s43, s43, s42 -; GFX11-NEXT: s_bitset1_b32 s42, 22 -; GFX11-NEXT: s_addk_i32 s43, 0x7fff -; GFX11-NEXT: s_and_b32 s44, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s42, s42, s43 +; GFX11-NEXT: s_bfe_u32 s72, s56, 0x10010 +; GFX11-NEXT: s_add_i32 s72, s72, s56 +; GFX11-NEXT: s_bitset1_b32 s56, 22 +; GFX11-NEXT: s_addk_i32 s72, 0x7fff +; GFX11-NEXT: s_and_b32 s75, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s75, s56, s72 ; GFX11-NEXT: s_lshl_b32 s27, s27, 16 -; GFX11-NEXT: s_lshr_b32 s71, s42, 16 +; GFX11-NEXT: s_lshr_b32 s68, s75, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s27 -; GFX11-NEXT: s_pack_ll_b32_b16 s44, s6, s59 -; GFX11-NEXT: s_pack_ll_b32_b16 s43, s5, s92 -; GFX11-NEXT: s_pack_ll_b32_b16 s92, s2, s62 -; GFX11-NEXT: s_pack_ll_b32_b16 s62, s20, s90 +; GFX11-NEXT: s_pack_ll_b32_b16 s56, s8, s47 +; GFX11-NEXT: s_pack_ll_b32_b16 s72, s10, s46 +; GFX11-NEXT: s_pack_ll_b32_b16 s46, s6, s57 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s27, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s58, s27, 0x10010 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s42, s58, s27 +; GFX11-NEXT: s_bfe_u32 s42, s27, 0x10010 +; GFX11-NEXT: s_add_i32 s42, s42, s27 ; GFX11-NEXT: s_bitset1_b32 s27, 22 ; GFX11-NEXT: s_addk_i32 s42, 0x7fff -; GFX11-NEXT: s_and_b32 s58, vcc_lo, exec_lo +; GFX11-NEXT: s_and_b32 s43, vcc_lo, exec_lo ; GFX11-NEXT: s_cselect_b32 s27, s27, s42 -; GFX11-NEXT: s_and_b32 s58, s26, 0xffff0000 +; GFX11-NEXT: s_and_b32 s42, s26, 0xffff0000 ; GFX11-NEXT: s_lshr_b32 s27, s27, 16 -; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s58 -; GFX11-NEXT: s_pack_ll_b32_b16 s42, s4, s60 -; GFX11-NEXT: s_pack_ll_b32_b16 s91, s27, s71 +; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s42 +; GFX11-NEXT: s_pack_ll_b32_b16 s43, s1, s89 +; GFX11-NEXT: s_pack_ll_b32_b16 s42, s0, s59 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s58, v1 +; GFX11-NEXT: v_readfirstlane_b32 s45, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s59, s58, 0x10010 -; GFX11-NEXT: s_add_i32 s59, s59, s58 -; GFX11-NEXT: s_bitset1_b32 s58, 22 -; GFX11-NEXT: s_addk_i32 s59, 0x7fff -; GFX11-NEXT: s_and_b32 s60, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s58, s58, s59 +; GFX11-NEXT: s_bfe_u32 s47, s45, 0x10010 +; GFX11-NEXT: s_add_i32 s47, s47, s45 +; GFX11-NEXT: s_bitset1_b32 s45, 22 +; GFX11-NEXT: s_addk_i32 s47, 0x7fff +; GFX11-NEXT: s_and_b32 s57, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s45, s45, s47 ; GFX11-NEXT: s_lshl_b32 s26, s26, 16 -; GFX11-NEXT: s_lshr_b32 s90, s58, 16 +; GFX11-NEXT: s_lshr_b32 s70, s45, 16 ; GFX11-NEXT: v_add_f32_e64 v1, 0x40c00000, s26 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_readfirstlane_b32 s26, v1 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: s_bfe_u32 s59, s26, 0x10010 -; GFX11-NEXT: s_add_i32 s58, s59, s26 +; GFX11-NEXT: s_bfe_u32 s47, s26, 0x10010 +; GFX11-NEXT: s_add_i32 s45, s47, s26 ; GFX11-NEXT: s_bitset1_b32 s26, 22 -; GFX11-NEXT: s_addk_i32 s58, 0x7fff -; GFX11-NEXT: s_and_b32 s59, vcc_lo, exec_lo -; GFX11-NEXT: s_cselect_b32 s26, s26, s58 -; GFX11-NEXT: s_lshr_b64 s[58:59], s[36:37], 24 -; GFX11-NEXT: s_lshr_b32 s50, s62, 16 -; GFX11-NEXT: v_writelane_b32 v20, s58, 2 -; GFX11-NEXT: s_lshr_b32 s81, s62, 8 -; GFX11-NEXT: s_lshr_b32 s85, s75, 24 -; GFX11-NEXT: s_lshr_b32 s60, s75, 8 -; GFX11-NEXT: s_lshr_b32 s61, s74, 16 -; GFX11-NEXT: v_writelane_b32 v20, s59, 3 -; GFX11-NEXT: s_lshr_b64 s[58:59], s[34:35], 24 -; GFX11-NEXT: s_lshr_b32 s96, s74, 8 -; GFX11-NEXT: s_lshr_b64 s[74:75], s[74:75], 24 -; GFX11-NEXT: s_lshr_b32 s75, s42, 8 -; GFX11-NEXT: v_writelane_b32 v20, s58, 0 -; GFX11-NEXT: s_lshr_b32 s58, s63, 24 +; GFX11-NEXT: s_addk_i32 s45, 0x7fff +; GFX11-NEXT: s_and_b32 s47, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s26, s26, s45 +; GFX11-NEXT: s_lshr_b32 s45, s49, 24 +; GFX11-NEXT: s_lshr_b64 s[60:61], s[38:39], 24 +; GFX11-NEXT: v_writelane_b32 v21, s45, 10 +; GFX11-NEXT: s_lshr_b32 s45, s49, 8 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[50:51], 24 +; GFX11-NEXT: s_mov_b32 s61, s76 +; GFX11-NEXT: s_mov_b32 s63, s77 +; GFX11-NEXT: v_writelane_b32 v21, s45, 11 +; GFX11-NEXT: s_lshr_b32 s45, s48, 16 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[36:37], 24 +; GFX11-NEXT: s_mov_b32 s77, s88 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[30:31], 24 +; GFX11-NEXT: v_writelane_b32 v21, s45, 12 +; GFX11-NEXT: s_lshr_b32 s45, s48, 8 +; GFX11-NEXT: s_mov_b32 s89, s90 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[42:43], 24 +; GFX11-NEXT: s_lshr_b32 s101, s43, 24 +; GFX11-NEXT: v_writelane_b32 v21, s45, 13 +; GFX11-NEXT: s_lshr_b32 s45, s39, 24 +; GFX11-NEXT: s_lshr_b32 s102, s43, 8 +; GFX11-NEXT: s_lshr_b32 s103, s42, 16 +; GFX11-NEXT: s_lshr_b32 s104, s42, 8 +; GFX11-NEXT: v_writelane_b32 v21, s45, 14 +; GFX11-NEXT: s_lshr_b32 s45, s39, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s47, s7, s95 +; GFX11-NEXT: s_pack_ll_b32_b16 s57, s9, s77 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[52:53], 24 +; GFX11-NEXT: v_writelane_b32 v21, s45, 15 +; GFX11-NEXT: s_lshr_b32 s45, s38, 16 +; GFX11-NEXT: s_mov_b32 s75, s79 +; GFX11-NEXT: s_pack_ll_b32_b16 s79, s13, s89 ; GFX11-NEXT: s_lshr_b32 s26, s26, 16 -; GFX11-NEXT: s_lshr_b32 s65, s73, 24 -; GFX11-NEXT: s_pack_ll_b32_b16 s90, s26, s90 -; GFX11-NEXT: v_writelane_b32 v20, s59, 1 -; GFX11-NEXT: s_lshr_b32 s59, s63, 8 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[62:63], 24 -; GFX11-NEXT: s_lshr_b32 s63, s93, 24 -; GFX11-NEXT: s_lshr_b32 s82, s73, 8 -; GFX11-NEXT: v_writelane_b32 v20, s63, 21 -; GFX11-NEXT: s_lshr_b32 s63, s93, 8 -; GFX11-NEXT: s_lshr_b32 s84, s72, 16 -; GFX11-NEXT: s_lshr_b32 s51, s72, 8 -; GFX11-NEXT: s_lshr_b64 s[72:73], s[72:73], 24 -; GFX11-NEXT: v_writelane_b32 v20, s63, 22 -; GFX11-NEXT: s_lshr_b32 s63, s92, 16 -; GFX11-NEXT: s_lshr_b32 s86, s77, 24 -; GFX11-NEXT: s_lshr_b32 s87, s77, 8 -; GFX11-NEXT: s_lshr_b32 s52, s76, 16 -; GFX11-NEXT: v_writelane_b32 v20, s63, 23 -; GFX11-NEXT: s_lshr_b32 s63, s92, 8 -; GFX11-NEXT: s_lshr_b32 s100, s76, 8 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[76:77], 24 -; GFX11-NEXT: s_lshr_b32 s101, s89, 8 -; GFX11-NEXT: v_writelane_b32 v20, s63, 24 -; GFX11-NEXT: s_lshr_b32 s63, s95, 24 -; GFX11-NEXT: s_lshr_b32 s98, s79, 24 -; GFX11-NEXT: s_lshr_b32 s99, s79, 8 -; GFX11-NEXT: s_lshr_b32 s53, s78, 16 -; GFX11-NEXT: v_writelane_b32 v20, s63, 25 -; GFX11-NEXT: s_lshr_b32 s63, s95, 8 -; GFX11-NEXT: s_lshr_b32 s97, s78, 8 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[78:79], 24 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[92:93], 24 -; GFX11-NEXT: v_writelane_b32 v20, s63, 26 -; GFX11-NEXT: s_lshr_b32 s63, s43, 24 -; GFX11-NEXT: s_lshr_b32 s102, s94, 16 -; GFX11-NEXT: s_lshr_b32 s103, s94, 8 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[94:95], 24 -; GFX11-NEXT: v_writelane_b32 v20, s63, 27 -; GFX11-NEXT: s_lshr_b32 s63, s43, 8 -; GFX11-NEXT: s_lshr_b32 s73, s91, 24 -; GFX11-NEXT: s_lshr_b32 s77, s91, 8 -; GFX11-NEXT: s_lshr_b32 s83, s90, 8 -; GFX11-NEXT: v_writelane_b32 v20, s63, 28 -; GFX11-NEXT: s_lshr_b32 s63, s42, 16 -; GFX11-NEXT: s_lshr_b64 s[42:43], s[42:43], 24 -; GFX11-NEXT: s_lshr_b32 s66, s37, 24 -; GFX11-NEXT: s_lshr_b32 s67, s37, 8 -; GFX11-NEXT: v_writelane_b32 v20, s42, 6 -; GFX11-NEXT: s_lshr_b32 s68, s36, 16 -; GFX11-NEXT: s_lshr_b32 s49, s36, 8 -; GFX11-NEXT: s_lshr_b32 s69, s35, 24 -; GFX11-NEXT: s_lshr_b32 s70, s35, 8 -; GFX11-NEXT: v_writelane_b32 v20, s43, 7 +; GFX11-NEXT: v_writelane_b32 v21, s45, 16 +; GFX11-NEXT: s_pack_ll_b32_b16 s45, s5, s35 +; GFX11-NEXT: s_mov_b32 s91, s95 ; GFX11-NEXT: s_lshr_b64 s[42:43], s[44:45], 24 -; GFX11-NEXT: s_lshr_b32 s64, s34, 16 -; GFX11-NEXT: s_lshr_b32 s80, s34, 8 -; GFX11-NEXT: s_lshr_b32 s79, s45, 24 -; GFX11-NEXT: v_writelane_b32 v20, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s89, 24 -; GFX11-NEXT: s_lshr_b32 s93, s45, 8 -; GFX11-NEXT: s_lshr_b32 s95, s44, 16 -; GFX11-NEXT: s_lshr_b32 vcc_hi, s44, 8 -; GFX11-NEXT: v_writelane_b32 v20, s43, 5 -; GFX11-NEXT: s_lshr_b32 s43, s88, 16 -; GFX11-NEXT: s_lshr_b32 s34, s47, 24 -; GFX11-NEXT: s_lshr_b32 s55, s47, 8 -; GFX11-NEXT: s_lshr_b32 s35, s46, 16 -; GFX11-NEXT: v_writelane_b32 v20, s43, 29 -; GFX11-NEXT: s_lshr_b32 s43, s88, 8 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[88:89], 24 -; GFX11-NEXT: s_lshr_b32 s89, s90, 16 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[90:91], 24 -; GFX11-NEXT: v_writelane_b32 v20, s43, 30 -; GFX11-NEXT: s_lshr_b32 s36, s46, 8 -; GFX11-NEXT: s_lshr_b64 s[46:47], s[46:47], 24 -; GFX11-NEXT: s_lshr_b32 s37, s57, 24 -; GFX11-NEXT: s_lshr_b32 s38, s57, 8 -; GFX11-NEXT: s_lshr_b32 s39, s56, 16 -; GFX11-NEXT: s_lshr_b32 s48, s56, 8 -; GFX11-NEXT: s_lshr_b64 s[56:57], s[56:57], 24 -; GFX11-NEXT: s_lshr_b32 s91, s31, 24 -; GFX11-NEXT: s_lshr_b32 s43, s31, 8 -; GFX11-NEXT: s_lshr_b32 s44, s30, 16 -; GFX11-NEXT: s_lshr_b32 s45, s30, 8 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[30:31], 24 -; GFX11-NEXT: v_writelane_b32 v20, s43, 31 -; GFX11-NEXT: .LBB91_5: ; %end -; GFX11-NEXT: s_lshl_b32 s47, s103, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s95, s29, s63 +; GFX11-NEXT: v_writelane_b32 v21, s35, 17 +; GFX11-NEXT: s_pack_ll_b32_b16 s35, s41, s61 +; GFX11-NEXT: s_lshr_b64 s[58:59], s[48:49], 24 +; GFX11-NEXT: s_lshr_b32 s38, s38, 8 +; GFX11-NEXT: s_lshr_b32 s39, s51, 24 +; GFX11-NEXT: v_writelane_b32 v21, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s45, 24 +; GFX11-NEXT: s_lshr_b32 s48, s51, 8 +; GFX11-NEXT: s_lshr_b32 s49, s50, 16 +; GFX11-NEXT: s_lshr_b32 s50, s50, 8 +; GFX11-NEXT: v_writelane_b32 v21, s43, 5 +; GFX11-NEXT: s_lshr_b32 s69, s53, 24 +; GFX11-NEXT: s_lshr_b32 s71, s53, 8 +; GFX11-NEXT: s_lshr_b32 s81, s52, 16 +; GFX11-NEXT: s_lshr_b32 s82, s52, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 18 +; GFX11-NEXT: s_lshr_b32 s42, s45, 8 +; GFX11-NEXT: s_lshr_b32 s54, s37, 24 +; GFX11-NEXT: s_lshr_b32 s85, s37, 8 +; GFX11-NEXT: s_lshr_b32 s87, s36, 16 +; GFX11-NEXT: v_writelane_b32 v21, s42, 19 +; GFX11-NEXT: s_lshr_b32 s42, s44, 16 +; GFX11-NEXT: s_lshr_b32 s55, s36, 8 +; GFX11-NEXT: s_lshr_b32 s97, s31, 24 +; GFX11-NEXT: s_lshr_b32 s99, s31, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 20 +; GFX11-NEXT: s_lshr_b32 s42, s44, 8 +; GFX11-NEXT: s_lshr_b32 s100, s30, 16 +; GFX11-NEXT: s_lshr_b32 s64, s30, 8 +; GFX11-NEXT: s_lshr_b32 s51, s79, 24 +; GFX11-NEXT: v_writelane_b32 v21, s42, 21 +; GFX11-NEXT: s_lshr_b64 s[42:43], s[46:47], 24 +; GFX11-NEXT: s_lshr_b32 s36, s79, 8 +; GFX11-NEXT: s_lshr_b32 s53, s78, 16 +; GFX11-NEXT: s_lshr_b32 s80, s78, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 2 +; GFX11-NEXT: s_lshr_b32 s42, s47, 24 +; GFX11-NEXT: s_lshr_b32 s86, s92, 16 +; GFX11-NEXT: s_lshr_b32 s96, s92, 8 +; GFX11-NEXT: s_lshr_b32 s98, s95, 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 3 +; GFX11-NEXT: s_lshr_b32 s44, s94, 8 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[34:35], 24 +; GFX11-NEXT: s_lshr_b32 s45, s35, 24 +; GFX11-NEXT: s_lshr_b32 s65, s35, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 22 +; GFX11-NEXT: s_lshr_b32 s42, s47, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s47, s27, s68 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s35, s47, 24 +; GFX11-NEXT: v_writelane_b32 v21, s42, 23 +; GFX11-NEXT: s_lshr_b32 s42, s46, 16 +; GFX11-NEXT: s_lshr_b32 s52, s47, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 24 +; GFX11-NEXT: s_lshr_b32 s42, s46, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s46, s26, s70 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s37, s46, 16 +; GFX11-NEXT: v_writelane_b32 v21, s42, 25 +; GFX11-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 +; GFX11-NEXT: s_lshr_b32 s70, s46, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 0 +; GFX11-NEXT: s_lshr_b32 s42, s57, 24 +; GFX11-NEXT: v_writelane_b32 v21, s43, 1 +; GFX11-NEXT: s_lshr_b32 s43, s94, 16 +; GFX11-NEXT: v_writelane_b32 v21, s42, 26 +; GFX11-NEXT: s_lshr_b32 s42, s57, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s42, 27 +; GFX11-NEXT: s_lshr_b32 s42, s56, 16 +; GFX11-NEXT: v_writelane_b32 v21, s42, 28 +; GFX11-NEXT: s_lshr_b32 s42, s56, 8 +; GFX11-NEXT: s_lshr_b64 s[56:57], s[72:73], 24 +; GFX11-NEXT: v_writelane_b32 v21, s42, 29 +; GFX11-NEXT: s_lshr_b32 s42, s73, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v21, s42, 30 +; GFX11-NEXT: s_lshr_b32 s42, s73, 8 +; GFX11-NEXT: v_writelane_b32 v21, s42, 31 +; GFX11-NEXT: s_lshr_b32 s42, s72, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v20, s42, 0 +; GFX11-NEXT: s_lshr_b32 s42, s72, 8 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[78:79], 24 +; GFX11-NEXT: s_mov_b32 s73, s93 +; GFX11-NEXT: s_pack_ll_b32_b16 s93, s15, s93 +; GFX11-NEXT: v_writelane_b32 v20, s42, 1 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[92:93], 24 +; GFX11-NEXT: s_lshr_b32 s83, s93, 24 +; GFX11-NEXT: s_lshr_b32 s84, s93, 8 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[94:95], 24 +; GFX11-NEXT: s_lshr_b32 s42, s95, 8 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[46:47], 24 +; GFX11-NEXT: s_lshr_b32 s46, s34, 16 +; GFX11-NEXT: s_lshr_b32 s47, s34, 8 +; GFX11-NEXT: .LBB91_3: ; %end ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_and_b32 s57, s102, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s47 -; GFX11-NEXT: s_lshl_b32 s47, s94, 8 -; GFX11-NEXT: v_readlane_b32 s43, v20, 26 -; GFX11-NEXT: s_or_b32 s47, s57, s47 +; GFX11-NEXT: s_lshl_b32 s57, s104, 8 +; GFX11-NEXT: s_lshl_b32 s59, s90, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s57 +; GFX11-NEXT: s_and_b32 s57, s103, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s47, s47, 16 +; GFX11-NEXT: s_or_b32 s57, s57, s59 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_or_b32 s0, s0, s47 -; GFX11-NEXT: s_lshl_b32 s47, s43, 8 -; GFX11-NEXT: v_readlane_b32 s43, v20, 25 -; GFX11-NEXT: s_or_b32 s1, s1, s47 -; GFX11-NEXT: v_readlane_b32 s47, v20, 14 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshl_b32 s57, s57, 16 +; GFX11-NEXT: s_lshl_b32 s59, s101, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s57 +; GFX11-NEXT: s_lshl_b32 s57, s102, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s57, s43, 8 -; GFX11-NEXT: v_readlane_b32 s43, v20, 24 -; GFX11-NEXT: s_and_b32 s47, s47, 0xff +; GFX11-NEXT: s_or_b32 s1, s1, s57 +; GFX11-NEXT: s_and_b32 s57, vcc_hi, 0xff +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_or_b32 s57, s57, s59 +; GFX11-NEXT: s_lshl_b32 s59, s88, 8 +; GFX11-NEXT: s_lshl_b32 s57, s57, 16 ; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_or_b32 s47, s47, s57 -; GFX11-NEXT: v_readlane_b32 s103, v19, 7 -; GFX11-NEXT: s_lshl_b32 s47, s47, 16 -; GFX11-NEXT: v_readlane_b32 s102, v19, 6 -; GFX11-NEXT: s_or_b32 s1, s1, s47 -; GFX11-NEXT: s_lshl_b32 s47, s43, 8 -; GFX11-NEXT: v_readlane_b32 s43, v20, 23 -; GFX11-NEXT: s_or_b32 s2, s2, s47 -; GFX11-NEXT: s_lshl_b32 s47, s92, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_or_b32 s1, s1, s57 +; GFX11-NEXT: s_lshl_b32 s57, s64, 8 ; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: s_and_b32 s57, s43, 0xff -; GFX11-NEXT: v_readlane_b32 s43, v20, 22 -; GFX11-NEXT: s_or_b32 s47, s57, s47 -; GFX11-NEXT: s_lshl_b32 s0, s97, 8 -; GFX11-NEXT: s_lshl_b32 s47, s47, 16 -; GFX11-NEXT: s_and_b32 s1, s16, 0xff -; GFX11-NEXT: s_or_b32 s2, s2, s47 -; GFX11-NEXT: s_lshl_b32 s47, s43, 8 -; GFX11-NEXT: v_readlane_b32 s43, v20, 21 -; GFX11-NEXT: s_or_b32 s3, s3, s47 -; GFX11-NEXT: v_readlane_b32 s47, v20, 15 +; GFX11-NEXT: s_or_b32 s2, s2, s57 +; GFX11-NEXT: s_and_b32 s57, s100, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_or_b32 s57, s57, s59 +; GFX11-NEXT: s_lshl_b32 s59, s97, 8 +; GFX11-NEXT: s_lshl_b32 s57, s57, 16 +; GFX11-NEXT: s_and_b32 s0, s16, 0xff +; GFX11-NEXT: s_or_b32 s2, s2, s57 +; GFX11-NEXT: s_lshl_b32 s57, s99, 8 +; GFX11-NEXT: s_lshl_b32 s1, s55, 8 +; GFX11-NEXT: s_or_b32 s3, s3, s57 +; GFX11-NEXT: s_and_b32 s57, s66, 0xff ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-NEXT: s_or_b32 s0, s1, s0 -; GFX11-NEXT: s_lshl_b32 s57, s43, 8 -; GFX11-NEXT: s_lshl_b32 s1, s78, 8 -; GFX11-NEXT: s_and_b32 s47, s47, 0xff -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_or_b32 s47, s47, s57 -; GFX11-NEXT: s_and_b32 s16, s61, 0xff -; GFX11-NEXT: s_lshl_b32 s47, s47, 16 -; GFX11-NEXT: v_readlane_b32 s97, v19, 1 -; GFX11-NEXT: s_or_b32 s3, s3, s47 +; GFX11-NEXT: s_or_b32 s57, s57, s59 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s57, s57, 16 +; GFX11-NEXT: s_and_b32 s1, s87, 0xff +; GFX11-NEXT: s_or_b32 s3, s3, s57 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: s_and_b32 s2, s53, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s98, 8 -; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: s_lshl_b32 s2, s99, 8 +; GFX11-NEXT: s_lshl_b32 s2, s76, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_lshl_b32 s2, s85, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_readlane_b32 s99, v19, 3 +; GFX11-NEXT: s_lshl_b32 s3, s54, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s17, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s85, 8 +; GFX11-NEXT: s_lshl_b32 s16, s74, 8 ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v20, 16 +; GFX11-NEXT: v_readlane_b32 s2, v21, 6 ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s98, v19, 2 -; GFX11-NEXT: v_readlane_b32 s85, v18, 29 -; GFX11-NEXT: v_readlane_b32 s53, v18, 13 +; GFX11-NEXT: s_lshl_b32 s17, s69, 8 +; GFX11-NEXT: v_readlane_b32 s104, v19, 8 +; GFX11-NEXT: v_readlane_b32 s103, v19, 7 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s31, v18, 1 +; GFX11-NEXT: v_readlane_b32 s102, v19, 6 ; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s18, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s82, 8 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: s_lshl_b32 s18, s69, 8 +; GFX11-NEXT: v_readlane_b32 s101, v19, 5 ; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: s_lshl_b32 s2, s96, 8 +; GFX11-NEXT: s_and_b32 s2, s18, 0xff ; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_lshl_b32 s3, s74, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s3, s81, 0xff ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s3, s16, s3 -; GFX11-NEXT: s_lshl_b32 s16, s60, 8 +; GFX11-NEXT: s_or_b32 s3, s3, s16 +; GFX11-NEXT: s_lshl_b32 s16, s71, 8 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_lshl_b32 s0, s81, 8 +; GFX11-NEXT: s_and_b32 s0, s20, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s3, s19, 0xff -; GFX11-NEXT: s_and_b32 s1, s20, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s50, 8 ; GFX11-NEXT: s_or_b32 s3, s3, s16 -; GFX11-NEXT: v_readlane_b32 s16, v20, 17 +; GFX11-NEXT: v_readlane_b32 s16, v21, 7 ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff -; GFX11-NEXT: s_or_b32 s0, s1, s0 -; GFX11-NEXT: s_lshl_b32 s1, s62, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_and_b32 s1, s49, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_and_b32 s16, s16, 0xff -; GFX11-NEXT: s_lshl_b32 s19, s73, 8 +; GFX11-NEXT: v_readlane_b32 s18, v21, 9 ; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v20, 1 +; GFX11-NEXT: s_lshl_b32 s17, s60, 8 ; GFX11-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-NEXT: s_lshl_b32 s17, s70, 8 +; GFX11-NEXT: v_readlane_b32 s19, v21, 14 ; GFX11-NEXT: s_or_b32 s3, s3, s16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: s_and_b32 s2, s50, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s58, 8 -; GFX11-NEXT: s_or_b32 s1, s2, s1 -; GFX11-NEXT: s_lshl_b32 s2, s59, 8 +; GFX11-NEXT: s_lshl_b32 s2, s62, 8 +; GFX11-NEXT: s_lshl_b32 s3, s48, 8 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_and_b32 s2, s21, 0xff +; GFX11-NEXT: s_lshl_b32 s16, s39, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: v_readlane_b32 s3, v21, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: v_readlane_b32 s16, v20, 0 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_and_b32 s1, s21, 0xff +; GFX11-NEXT: s_and_b32 s18, s18, 0xff +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s19, s19, 8 +; GFX11-NEXT: s_or_b32 s3, s3, s16 +; GFX11-NEXT: v_readlane_b32 s16, v21, 16 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v20, 18 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s96, v19, 0 -; GFX11-NEXT: v_readlane_b32 s81, v18, 25 -; GFX11-NEXT: v_readlane_b32 s70, v18, 22 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: v_readlane_b32 s69, v18, 21 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_b32 s2, s22, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s38, 8 +; GFX11-NEXT: s_and_b32 s16, s16, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s22, 0xff -; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: v_readlane_b32 s50, v18, 10 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: s_lshl_b32 s2, s80, 8 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: s_or_b32 s2, s3, s2 -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 -; GFX11-NEXT: s_and_b32 s16, s64, 0xff -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s3, s16, s3 +; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: v_readlane_b32 s17, v21, 15 ; GFX11-NEXT: s_and_b32 s16, s23, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v20, 19 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: s_lshl_b32 s17, s17, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s16, 0xffff +; GFX11-NEXT: s_or_b32 s16, s16, s17 +; GFX11-NEXT: s_or_b32 s17, s18, s19 +; GFX11-NEXT: s_and_b32 s16, s16, 0xffff +; GFX11-NEXT: s_lshl_b32 s17, s17, 16 +; GFX11-NEXT: v_readlane_b32 s1, v21, 13 +; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: v_readlane_b32 s2, v21, 12 ; GFX11-NEXT: s_and_b32 s0, s24, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s49, 8 -; GFX11-NEXT: s_and_b32 s17, s17, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_lshl_b32 s3, s58, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s17, s17, s18 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s16, s17, 16 -; GFX11-NEXT: v_readlane_b32 s17, v20, 3 -; GFX11-NEXT: s_or_b32 s3, s3, s16 -; GFX11-NEXT: v_readlane_b32 s16, v20, 2 -; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 -; GFX11-NEXT: s_and_b32 s2, s68, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s66, 8 -; GFX11-NEXT: s_lshl_b32 s3, s16, 8 -; GFX11-NEXT: v_readlane_b32 s16, v20, 20 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: v_readlane_b32 s17, v21, 10 ; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: v_readlane_b32 s3, v21, 11 ; GFX11-NEXT: s_and_b32 s2, s25, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s67, 8 +; GFX11-NEXT: s_and_b32 s16, s67, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s17, 8 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff @@ -174328,14 +173549,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s26, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s83, 8 -; GFX11-NEXT: s_and_b32 s16, s89, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s90, 8 +; GFX11-NEXT: s_lshl_b32 s3, s70, 8 +; GFX11-NEXT: s_and_b32 s16, s37, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s94, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 ; GFX11-NEXT: s_and_b32 s16, s27, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s77, 8 -; GFX11-NEXT: s_and_b32 s18, s71, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s52, 8 +; GFX11-NEXT: s_and_b32 s18, s68, 0xff +; GFX11-NEXT: s_lshl_b32 s19, s35, 8 ; GFX11-NEXT: s_or_b32 s16, s16, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff @@ -174344,61 +173566,56 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 ; GFX11-NEXT: s_and_b32 s0, s40, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s45, 8 -; GFX11-NEXT: s_and_b32 s2, s44, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s47, 8 +; GFX11-NEXT: s_and_b32 s2, s46, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s30, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: v_readlane_b32 s3, v20, 31 ; GFX11-NEXT: s_and_b32 s2, s41, 0xff -; GFX11-NEXT: s_and_b32 s16, s104, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s91, 8 -; GFX11-NEXT: s_and_b32 s0, s0, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_lshl_b32 s3, s65, 8 +; GFX11-NEXT: s_and_b32 s16, s61, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s45, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 -; GFX11-NEXT: v_readlane_b32 s3, v20, 30 -; GFX11-NEXT: v_readlane_b32 s16, v20, 29 ; GFX11-NEXT: s_and_b32 s2, s28, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s88, 8 -; GFX11-NEXT: s_and_b32 s18, s54, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_and_b32 s16, s16, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s44, 8 +; GFX11-NEXT: s_and_b32 s16, s43, 0xff +; GFX11-NEXT: s_lshl_b32 s17, s92, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 ; GFX11-NEXT: s_and_b32 s16, s29, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s101, 8 -; GFX11-NEXT: s_lshl_b32 s19, s42, 8 +; GFX11-NEXT: s_lshl_b32 s17, s42, 8 +; GFX11-NEXT: s_and_b32 s18, s63, 0xff +; GFX11-NEXT: s_lshl_b32 s19, s98, 8 ; GFX11-NEXT: s_or_b32 s16, s16, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_and_b32 s16, s16, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: s_and_b32 s0, s14, 0xff -; GFX11-NEXT: v_readlane_b32 s14, v20, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: s_lshl_b32 s1, s100, 8 -; GFX11-NEXT: s_and_b32 s2, s52, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s76, 8 +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 +; GFX11-NEXT: s_and_b32 s0, s14, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s96, 8 +; GFX11-NEXT: s_and_b32 s2, s86, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s78, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s15, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s87, 8 -; GFX11-NEXT: s_and_b32 s14, s14, 0xff -; GFX11-NEXT: s_lshl_b32 s15, s86, 8 +; GFX11-NEXT: s_lshl_b32 s3, s84, 8 +; GFX11-NEXT: s_and_b32 s14, s73, 0xff +; GFX11-NEXT: s_lshl_b32 s15, s83, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s14, s15 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff @@ -174408,131 +173625,149 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s12, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s51, 8 -; GFX11-NEXT: s_and_b32 s12, s84, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s80, 8 +; GFX11-NEXT: s_and_b32 s12, s53, 0xff ; GFX11-NEXT: s_lshl_b32 s14, s72, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s12, s14 -; GFX11-NEXT: v_readlane_b32 s14, v20, 9 ; GFX11-NEXT: s_and_b32 s12, s13, 0xff -; GFX11-NEXT: s_lshl_b32 s13, s82, 8 -; GFX11-NEXT: s_lshl_b32 s15, s65, 8 -; GFX11-NEXT: s_or_b32 s12, s12, s13 -; GFX11-NEXT: s_and_b32 s14, s14, 0xff +; GFX11-NEXT: s_lshl_b32 s13, s36, 8 +; GFX11-NEXT: s_and_b32 s14, s89, 0xff +; GFX11-NEXT: s_lshl_b32 s15, s51, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s13, s14, s15 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s12, s12, s13 +; GFX11-NEXT: s_or_b32 s13, s14, s15 ; GFX11-NEXT: s_and_b32 s12, s12, 0xffff ; GFX11-NEXT: s_lshl_b32 s13, s13, 16 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 -; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 -; GFX11-NEXT: s_and_b32 s0, s10, 0xff -; GFX11-NEXT: v_readlane_b32 s10, v20, 10 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s12, s13 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 -; GFX11-NEXT: s_lshl_b32 s1, s48, 8 -; GFX11-NEXT: s_and_b32 s2, s39, 0xff +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: v_readlane_b32 s1, v20, 1 +; GFX11-NEXT: v_readlane_b32 s2, v20, 0 +; GFX11-NEXT: s_and_b32 s0, s10, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s56, 8 +; GFX11-NEXT: s_and_b32 s10, s75, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s11, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s38, 8 -; GFX11-NEXT: s_and_b32 s10, s10, 0xff -; GFX11-NEXT: s_lshl_b32 s11, s37, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s10, s11 +; GFX11-NEXT: v_readlane_b32 s3, v21, 31 +; GFX11-NEXT: v_readlane_b32 s11, v21, 30 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s10, s11 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s8, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s36, 8 -; GFX11-NEXT: s_and_b32 s8, s35, 0xff -; GFX11-NEXT: s_lshl_b32 s10, s46, 8 +; GFX11-NEXT: v_readlane_b32 s3, v21, 29 +; GFX11-NEXT: v_readlane_b32 s8, v21, 28 +; GFX11-NEXT: v_readlane_b32 s10, v21, 0 +; GFX11-NEXT: v_readlane_b32 s11, v21, 1 +; GFX11-NEXT: v_readlane_b32 s11, v21, 26 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s10, 8 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s8, s10 -; GFX11-NEXT: v_readlane_b32 s10, v20, 11 ; GFX11-NEXT: s_and_b32 s8, s9, 0xff -; GFX11-NEXT: s_lshl_b32 s9, s55, 8 -; GFX11-NEXT: s_lshl_b32 s11, s34, 8 -; GFX11-NEXT: s_or_b32 s8, s8, s9 -; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: v_readlane_b32 s9, v21, 27 +; GFX11-NEXT: s_and_b32 s10, s77, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s11, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_or_b32 s9, s10, s11 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 ; GFX11-NEXT: s_and_b32 s8, s8, 0xffff ; GFX11-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 ; GFX11-NEXT: s_or_b32 s3, s8, s9 -; GFX11-NEXT: v_readlane_b32 s8, v20, 4 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 +; GFX11-NEXT: v_readlane_b32 s1, v21, 25 +; GFX11-NEXT: v_readlane_b32 s2, v21, 24 +; GFX11-NEXT: v_readlane_b32 s8, v21, 2 ; GFX11-NEXT: s_and_b32 s0, s6, 0xff -; GFX11-NEXT: v_readlane_b32 s6, v20, 12 -; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v4, s3 -; GFX11-NEXT: s_lshl_b32 s1, vcc_hi, 8 -; GFX11-NEXT: s_and_b32 s2, s95, 0xff +; GFX11-NEXT: s_and_b32 s6, s91, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s8, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s93, 8 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s79, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s3 -; GFX11-NEXT: s_or_b32 s3, s6, s7 -; GFX11-NEXT: v_readlane_b32 s6, v20, 6 +; GFX11-NEXT: v_readlane_b32 s3, v21, 23 +; GFX11-NEXT: v_readlane_b32 s7, v21, 22 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_readlane_b32 s9, v21, 3 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s3, s6, s7 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s4, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s75, 8 -; GFX11-NEXT: s_and_b32 s4, s63, 0xff +; GFX11-NEXT: v_readlane_b32 s3, v21, 21 +; GFX11-NEXT: v_readlane_b32 s4, v21, 20 +; GFX11-NEXT: v_readlane_b32 s6, v21, 4 +; GFX11-NEXT: v_readlane_b32 s7, v21, 5 +; GFX11-NEXT: v_readlane_b32 s7, v21, 18 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: v_readlane_b32 s7, v20, 7 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s4, s6 ; GFX11-NEXT: s_and_b32 s4, s5, 0xff -; GFX11-NEXT: v_readlane_b32 s5, v20, 28 -; GFX11-NEXT: v_readlane_b32 s6, v20, 13 -; GFX11-NEXT: v_readlane_b32 s7, v20, 27 +; GFX11-NEXT: v_readlane_b32 s5, v21, 19 +; GFX11-NEXT: v_readlane_b32 s6, v21, 17 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_lshl_b32 s5, s5, 8 ; GFX11-NEXT: s_and_b32 s6, s6, 0xff -; GFX11-NEXT: s_lshl_b32 s7, s7, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s5 ; GFX11-NEXT: s_or_b32 s5, s6, s7 ; GFX11-NEXT: s_and_b32 s4, s4, 0xffff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s4, s5 -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:64 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 -; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 -; GFX11-NEXT: v_readlane_b32 s9, v20, 5 +; GFX11-NEXT: v_dual_mov_b32 v9, s0 :: v_dual_mov_b32 v10, s1 +; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:112 -; GFX11-NEXT: v_readlane_b32 s104, v19, 8 -; GFX11-NEXT: v_readlane_b32 s101, v19, 5 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:112 ; GFX11-NEXT: v_readlane_b32 s100, v19, 4 +; GFX11-NEXT: v_readlane_b32 s99, v19, 3 +; GFX11-NEXT: v_readlane_b32 s98, v19, 2 +; GFX11-NEXT: v_readlane_b32 s97, v19, 1 +; GFX11-NEXT: v_readlane_b32 s96, v19, 0 ; GFX11-NEXT: v_readlane_b32 s87, v18, 31 ; GFX11-NEXT: v_readlane_b32 s86, v18, 30 +; GFX11-NEXT: v_readlane_b32 s85, v18, 29 ; GFX11-NEXT: v_readlane_b32 s84, v18, 28 ; GFX11-NEXT: v_readlane_b32 s83, v18, 27 ; GFX11-NEXT: v_readlane_b32 s82, v18, 26 +; GFX11-NEXT: v_readlane_b32 s81, v18, 25 ; GFX11-NEXT: v_readlane_b32 s80, v18, 24 ; GFX11-NEXT: v_readlane_b32 s71, v18, 23 +; GFX11-NEXT: v_readlane_b32 s70, v18, 22 +; GFX11-NEXT: v_readlane_b32 s69, v18, 21 ; GFX11-NEXT: v_readlane_b32 s68, v18, 20 ; GFX11-NEXT: v_readlane_b32 s67, v18, 19 ; GFX11-NEXT: v_readlane_b32 s66, v18, 18 @@ -174540,8 +173775,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_readlane_b32 s64, v18, 16 ; GFX11-NEXT: v_readlane_b32 s55, v18, 15 ; GFX11-NEXT: v_readlane_b32 s54, v18, 14 +; GFX11-NEXT: v_readlane_b32 s53, v18, 13 ; GFX11-NEXT: v_readlane_b32 s52, v18, 12 ; GFX11-NEXT: v_readlane_b32 s51, v18, 11 +; GFX11-NEXT: v_readlane_b32 s50, v18, 10 ; GFX11-NEXT: v_readlane_b32 s49, v18, 9 ; GFX11-NEXT: v_readlane_b32 s48, v18, 8 ; GFX11-NEXT: v_readlane_b32 s39, v18, 7 @@ -174550,15 +173787,150 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_readlane_b32 s36, v18, 4 ; GFX11-NEXT: v_readlane_b32 s35, v18, 3 ; GFX11-NEXT: v_readlane_b32 s34, v18, 2 +; GFX11-NEXT: v_readlane_b32 s31, v18, 1 ; GFX11-NEXT: v_readlane_b32 s30, v18, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x2 ; 12-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v18, off, s32 ; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB91_4: +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v21, vcc_lo, 0 +; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: v_writelane_b32 v21, vcc_hi, 1 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: v_writelane_b32 v21, vcc_lo, 2 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: s_mov_b32 s34, -1 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr56 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: v_writelane_b32 v21, vcc_hi, 3 +; GFX11-NEXT: ; kill: killed $sgpr56 +; GFX11-NEXT: ; implicit-def: $sgpr56 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr104 +; GFX11-NEXT: ; implicit-def: $sgpr103 +; GFX11-NEXT: ; implicit-def: $sgpr90 +; GFX11-NEXT: ; implicit-def: $sgpr102 +; GFX11-NEXT: ; implicit-def: $sgpr101 +; GFX11-NEXT: ; implicit-def: $sgpr64 +; GFX11-NEXT: ; implicit-def: $sgpr100 +; GFX11-NEXT: ; implicit-def: $sgpr88 +; GFX11-NEXT: ; implicit-def: $sgpr99 +; GFX11-NEXT: ; implicit-def: $sgpr66 +; GFX11-NEXT: ; implicit-def: $sgpr97 +; GFX11-NEXT: ; implicit-def: $sgpr55 +; GFX11-NEXT: ; implicit-def: $sgpr87 +; GFX11-NEXT: ; implicit-def: $sgpr76 +; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr82 +; GFX11-NEXT: ; implicit-def: $sgpr81 +; GFX11-NEXT: ; implicit-def: $sgpr74 +; GFX11-NEXT: ; implicit-def: $sgpr71 +; GFX11-NEXT: ; implicit-def: $sgpr69 +; GFX11-NEXT: ; implicit-def: $sgpr50 +; GFX11-NEXT: ; implicit-def: $sgpr49 +; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: ; implicit-def: $sgpr48 +; GFX11-NEXT: ; implicit-def: $sgpr39 +; GFX11-NEXT: ; implicit-def: $sgpr38 +; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: ; implicit-def: $sgpr58 +; GFX11-NEXT: ; implicit-def: $sgpr67 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr70 +; GFX11-NEXT: ; implicit-def: $sgpr37 +; GFX11-NEXT: ; implicit-def: $sgpr94 +; GFX11-NEXT: ; implicit-def: $sgpr52 +; GFX11-NEXT: ; implicit-def: $sgpr68 +; GFX11-NEXT: ; implicit-def: $sgpr35 +; GFX11-NEXT: ; implicit-def: $sgpr47 +; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: ; implicit-def: $sgpr30 +; GFX11-NEXT: ; implicit-def: $sgpr65 +; GFX11-NEXT: ; implicit-def: $sgpr61 +; GFX11-NEXT: ; implicit-def: $sgpr45 +; GFX11-NEXT: ; implicit-def: $sgpr44 +; GFX11-NEXT: ; implicit-def: $sgpr43 +; GFX11-NEXT: ; implicit-def: $sgpr92 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr63 +; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr96 +; GFX11-NEXT: ; implicit-def: $sgpr86 +; GFX11-NEXT: ; implicit-def: $sgpr78 +; GFX11-NEXT: ; implicit-def: $sgpr84 +; GFX11-NEXT: ; implicit-def: $sgpr73 +; GFX11-NEXT: ; implicit-def: $sgpr83 +; GFX11-NEXT: ; implicit-def: $sgpr80 +; GFX11-NEXT: ; implicit-def: $sgpr53 +; GFX11-NEXT: ; implicit-def: $sgpr72 +; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr89 +; GFX11-NEXT: ; implicit-def: $sgpr51 +; GFX11-NEXT: ; kill: killed $sgpr56 +; GFX11-NEXT: ; implicit-def: $sgpr56 +; GFX11-NEXT: ; implicit-def: $sgpr75 +; GFX11-NEXT: ; implicit-def: $sgpr77 +; GFX11-NEXT: ; implicit-def: $sgpr91 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: ; implicit-def: $sgpr57 +; GFX11-NEXT: ; kill: killed $sgpr57 +; GFX11-NEXT: v_writelane_b32 v21, vcc_lo, 4 +; GFX11-NEXT: v_writelane_b32 v21, vcc_hi, 5 +; GFX11-NEXT: s_branch .LBB91_2 %cmp = icmp eq i32 %b, 0 br i1 %cmp, label %cmp.true, label %cmp.false @@ -176841,22 +176213,22 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 @@ -176874,148 +176246,148 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v1 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:184 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v9 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v11 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v14 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v51 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v53 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 @@ -177023,506 +176395,410 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:340 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:348 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:356 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:364 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:372 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v0 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v0 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:44 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:20 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:12 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:60 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB92_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v28, v33, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v48, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v54, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v40, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v46, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v14, v37, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v25, v25, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: v_or_b32_sdwa v26, v26, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v4, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v5, v5, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v63, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v60, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v56, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v58, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v12, v63, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v34, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v35, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v38, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v25, v25, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v26, v26, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v36, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v50, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v30, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v55, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v31, v31, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v42, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; kill: killed $vgpr32 @@ -177627,398 +176903,553 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: .LBB92_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB92_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v18, 0x300 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: v_or_b32_sdwa v29, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v0, 3, v46 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 3, v36 +; VI-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v4, 3, v33 +; VI-NEXT: v_add_u16_e32 v37, 3, v37 +; VI-NEXT: v_add_u16_e32 v35, 3, v35 +; VI-NEXT: v_add_u16_e32 v34, 3, v34 +; VI-NEXT: v_add_u16_e32 v3, 3, v50 +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v3, 3, v48 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v2, 3, v55 +; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v2, 3, v54 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v1, 3, v42 +; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v1, 3, v40 +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v57, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v4, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v0, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v29, 0x300, v29 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v9, 3, v9 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v9, 3, v9 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v2, 0x300, v3 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v10, 3, v10 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_e32 v10, 3, v10 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v1, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v4 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u16_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v6, 3, v6 -; VI-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v11, 3, v11 +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v5, 3, v5 -; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_e32 v11, 3, v11 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v51, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v12, 3, v12 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v49, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v5, 3, v5 -; VI-NEXT: v_or_b32_sdwa v5, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v5, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v12, 3, v12 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v6, 3, v6 -; VI-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v32, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v32, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v13, 3, v13 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v13, 3, v13 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v28, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 -; VI-NEXT: v_or_b32_e32 v28, v28, v32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v33, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v33, v33, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v14, 3, v14 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v27, 0x300, v27 -; VI-NEXT: v_or_b32_e32 v27, v27, v33 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v34, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v34, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v26, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 -; VI-NEXT: v_or_b32_e32 v26, v26, v34 +; VI-NEXT: v_or_b32_sdwa v17, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v35, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v35, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v35 +; VI-NEXT: v_or_b32_sdwa v16, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v36, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 -; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v36, v36, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v24, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 -; VI-NEXT: v_or_b32_e32 v24, v24, v36 +; VI-NEXT: v_or_b32_sdwa v36, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v37, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v37, v37, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 3, v39 +; VI-NEXT: v_or_b32_sdwa v39, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v14, 3, v38 +; VI-NEXT: v_add_u16_e32 v38, 3, v63 +; VI-NEXT: v_mov_b32_e32 v63, 0x300 +; VI-NEXT: v_add_u16_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v19, v12, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v32 +; VI-NEXT: v_add_u16_sdwa v20, v11, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v10, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v22, v9, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v23, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v27, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v28, v3, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v29, v2, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v30, v1, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v31, v0, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v33, v33, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v36, v36, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v39, v39, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v36 +; VI-NEXT: v_or_b32_e32 v16, v16, v33 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v14, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v39 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v23, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 -; VI-NEXT: v_or_b32_e32 v23, v23, v37 +; VI-NEXT: v_or_b32_sdwa v37, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v35, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v12, v34, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v38, v18, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v34, 3, v34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 -; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_u16_e32 v8, 3, v63 +; VI-NEXT: v_add_u16_e32 v48, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v9, 3, v62 -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 -; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_u16_e32 v9, 3, v61 +; VI-NEXT: v_or_b32_sdwa v48, v18, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v11, v48, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v10, 3, v60 -; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 -; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_u16_e32 v10, 3, v57 +; VI-NEXT: v_add_u16_e32 v49, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v11, 3, v56 -; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 -; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_u16_e32 v11, 3, v59 +; VI-NEXT: v_or_b32_sdwa v49, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v12, 3, v58 -; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 -; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_u16_e32 v12, 3, v47 +; VI-NEXT: v_add_u16_e32 v50, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v53, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v50, v18, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v10, v50, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v13, 3, v46 -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_u16_e32 v13, 3, v45 +; VI-NEXT: v_add_u16_e32 v51, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v52, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v24, v7, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v14, 3, v44 -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 -; VI-NEXT: v_add_u16_e32 v14, 3, v43 +; VI-NEXT: v_or_b32_sdwa v51, v18, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v15, 3, v42 -; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 -; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: v_add_u16_e32 v52, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v52, v18, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v9, v52, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v16, 3, v16 -; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_add_u16_e32 v53, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v53, v18, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v17, 3, v17 -; VI-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 +; VI-NEXT: v_add_u16_e32 v54, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v19, 3, v19 -; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v54, v18, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v8, v54, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v21, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v21 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v16, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v19, 0x300, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v16, v19, v16 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v55, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v55, v18, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v43, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: v_or_b32_sdwa v30, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v30, 0x300, v30 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v31, v51, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v31, 0x300, v31 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v40, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v29, v40 +; VI-NEXT: v_add_u16_e32 v40, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v38, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v38, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v40, v18, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v41, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v25, v6, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v7, v40, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v55, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 -; VI-NEXT: v_or_b32_e32 v22, v22, v38 -; VI-NEXT: v_or_b32_e32 v30, v30, v55 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v39, 3, v39 -; VI-NEXT: v_or_b32_sdwa v39, v48, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v21, v39, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v48, 3, v48 -; VI-NEXT: v_or_b32_sdwa v48, v49, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v49, 3, v49 -; VI-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v20, v49, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v50, 3, v50 -; VI-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v19, 3, v19 -; VI-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v41, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v41, v18, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v45, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v39, 3, v39 -; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v39, 0x300, v39 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v51, 3, v51 -; VI-NEXT: v_or_b32_sdwa v51, v52, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v52, 3, v52 -; VI-NEXT: v_or_b32_sdwa v52, v53, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v53, 3, v53 -; VI-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v54, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v19, v51, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v18, v53, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v39, v18 -; VI-NEXT: v_add_u16_e32 v39, 0x300, v52 -; VI-NEXT: v_or_b32_e32 v19, v39, v19 -; VI-NEXT: v_add_u16_e32 v39, 0x300, v50 -; VI-NEXT: v_or_b32_e32 v20, v39, v20 -; VI-NEXT: v_add_u16_e32 v39, 0x300, v48 -; VI-NEXT: v_or_b32_e32 v21, v39, v21 -; VI-NEXT: v_or_b32_e32 v31, v31, v54 +; VI-NEXT: v_add_u16_e32 v42, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v42, v18, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v44, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v26, v5, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v6, v42, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v43, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v43, v18, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v44, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v44, v18, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v5, v44, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v45, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v45, v18, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v46, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v46, v18, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v4, v46, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v47, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v47, v18, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v56, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v56, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v3, v56, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v57, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v57, v18, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v58, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v58, v18, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v2, v58, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v59, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v59, v18, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v60, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v60, v18, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v1, v60, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v61, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v61, v18, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v62, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v62, v18, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v18, v13, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v13, v37, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v18, v32, v18 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v0, v62, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v34, v37, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v19, v32, v19 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, v34, v0 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v61 +; VI-NEXT: v_or_b32_e32 v1, v34, v1 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v59 +; VI-NEXT: v_or_b32_e32 v2, v34, v2 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v57 +; VI-NEXT: v_or_b32_e32 v3, v34, v3 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v47 +; VI-NEXT: v_or_b32_e32 v4, v34, v4 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v45 +; VI-NEXT: v_or_b32_e32 v5, v34, v5 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v43 +; VI-NEXT: v_or_b32_e32 v6, v34, v6 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v41 +; VI-NEXT: v_or_b32_e32 v7, v34, v7 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v55 +; VI-NEXT: v_or_b32_e32 v8, v34, v8 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v53 +; VI-NEXT: v_or_b32_e32 v9, v34, v9 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v51 +; VI-NEXT: v_or_b32_e32 v10, v34, v10 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v49 +; VI-NEXT: v_or_b32_e32 v11, v34, v11 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v38 +; VI-NEXT: v_or_b32_e32 v12, v34, v12 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v35 +; VI-NEXT: v_or_b32_e32 v13, v34, v13 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v20, v32, v20 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v21, v32, v21 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v22, v32, v22 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v23, v32, v23 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v24, v32, v24 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v25, v32, v25 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v26, v32, v26 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v28, v32, v28 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v29, v32, v29 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v30, v32, v30 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: .LBB92_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload @@ -178059,22 +177490,22 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 @@ -178093,192 +177524,186 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v1 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v11 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v17 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v25 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v13 -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v15 -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17 -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v25 -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v37 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v48 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v33 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v49 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v52 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v35 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:156 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v53 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -178286,26 +177711,24 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -178313,460 +177736,373 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:324 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:340 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:348 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:356 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:364 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:372 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v1 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:44 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:84 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB92_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_or_b32_sdwa v28, v38, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v48, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v54, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v40, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v32, v45, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v10, v57, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v13, v63, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v14, v34, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v16, v17, v16, s6 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v17, v19, v18, s6 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v18, v35, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v17, v18, v17, s6 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v18, v19, v18, s6 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v36, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s6 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v20, v21, v20, s6 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s6 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v22, v23, v22, s6 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v49, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v52, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v41, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v30, v31, v30, s6 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; kill: killed $vgpr32 @@ -178869,403 +178205,537 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: .LBB92_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB92_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u16_e32 v0, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v30, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v25, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v26, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v27, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v28, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v49 +; GFX9-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-NEXT: v_or_b32_sdwa v35, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v48 +; GFX9-NEXT: v_mov_b32_e32 v2, v36 +; GFX9-NEXT: v_or_b32_sdwa v36, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v3, v37 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v2 -; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v3 -; GFX9-NEXT: v_perm_b32 v0, v2, v0, s6 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v37, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v39, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s6 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 -; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 -; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 -; GFX9-NEXT: v_add_u16_e32 v35, 0x300, v25 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v48, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v38, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v23, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v24, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v18, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v19, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v20, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v21, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v36, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v22 -; GFX9-NEXT: v_add_u16_e32 v36, 0x300, v36 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v22, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v23, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v37, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v21 -; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v23 -; GFX9-NEXT: v_perm_b32 v29, v34, v29, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_or_b32_sdwa v49, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v49, 0x300, v49 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v50, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v38, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v51, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v52, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v39, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 -; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v39 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v48, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 -; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v43, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v45, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v46, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v47, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v56, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v57, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v58, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v34, 3, v34 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 -; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 -; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v57 +; GFX9-NEXT: v_or_b32_sdwa v59, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v63 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 -; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 -; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v59 +; GFX9-NEXT: v_or_b32_sdwa v60, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v58 -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 -; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 -; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v61, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 -; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 -; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v62, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 -; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 -; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v63, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v15, 3, v42 -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 -; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 -; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 -; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 +; GFX9-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v16 -; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v18 -; GFX9-NEXT: v_perm_b32 v17, v17, v20, s6 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v19 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v16, v18, v16, s6 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v49, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v20 -; GFX9-NEXT: v_perm_b32 v30, v33, v30, s6 +; GFX9-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v50, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v52, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v39, 0x300, v50 -; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v18 +; GFX9-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v51, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v51 +; GFX9-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v52, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v32, 0x300, v19 -; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 +; GFX9-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v53, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v53 +; GFX9-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v54, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_perm_b32 v6, v6, v7, s6 +; GFX9-NEXT: v_perm_b32 v7, v8, v9, s6 +; GFX9-NEXT: v_perm_b32 v8, v14, v15, s6 +; GFX9-NEXT: v_perm_b32 v9, v10, v11, s6 +; GFX9-NEXT: v_perm_b32 v10, v12, v13, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v55, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v55 +; GFX9-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v40, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v50, 0x300, v40 -; GFX9-NEXT: v_perm_b32 v21, v50, v21, s6 +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v41, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v41 +; GFX9-NEXT: v_or_b32_sdwa v42, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v42, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v51, 0x300, v42 -; GFX9-NEXT: v_perm_b32 v20, v51, v20, s6 +; GFX9-NEXT: v_or_b32_sdwa v44, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v43, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v43 +; GFX9-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v44, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 +; GFX9-NEXT: v_perm_b32 v3, v31, v3, s6 +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v27 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v23 +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v50 +; GFX9-NEXT: v_add_u16_e32 v50, 0x300, v51 +; GFX9-NEXT: v_add_u16_e32 v51, 0x300, v53 +; GFX9-NEXT: v_add_u16_e32 v53, 0x300, v41 +; GFX9-NEXT: v_add_u16_e32 v41, 0x300, v59 +; GFX9-NEXT: v_add_u16_e32 v59, 0x300, v60 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v45, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v45 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v32, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v32, 0x300, v32 +; GFX9-NEXT: v_perm_b32 v2, v32, v2, s6 +; GFX9-NEXT: v_add_u16_e32 v32, 0x300, v25 +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v20 +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v40 +; GFX9-NEXT: v_add_u16_e32 v40, 0x300, v57 +; GFX9-NEXT: v_add_u16_e32 v57, 0x300, v17 +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v44 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v46, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v24 -; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v26 -; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v37 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v33, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v33 +; GFX9-NEXT: v_perm_b32 v1, v33, v1, s6 +; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v26 +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v18 +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v46 +; GFX9-NEXT: v_add_u16_e32 v46, 0x300, v16 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v42 +; GFX9-NEXT: v_perm_b32 v4, v17, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v16, v5, s6 +; GFX9-NEXT: v_perm_b32 v11, v46, v57, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v34, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v34 +; GFX9-NEXT: v_perm_b32 v0, v34, v0, s6 +; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v28 +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v48 +; GFX9-NEXT: v_add_u16_e32 v48, 0x300, v21 +; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v54 +; GFX9-NEXT: v_add_u16_e32 v54, 0x300, v45 +; GFX9-NEXT: v_add_u16_e32 v45, 0x300, v63 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v30 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v35 +; GFX9-NEXT: v_add_u16_e32 v35, 0x300, v36 +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v37 +; GFX9-NEXT: v_add_u16_e32 v36, 0x300, v39 ; GFX9-NEXT: v_add_u16_e32 v37, 0x300, v38 -; GFX9-NEXT: v_add_u16_e32 v38, 0x300, v48 -; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v49 -; GFX9-NEXT: v_add_u16_e32 v48, 0x300, v52 -; GFX9-NEXT: v_add_u16_e32 v49, 0x300, v54 -; GFX9-NEXT: v_add_u16_e32 v52, 0x300, v44 -; GFX9-NEXT: v_add_u16_e32 v53, 0x300, v46 -; GFX9-NEXT: v_perm_b32 v18, v53, v18, s6 -; GFX9-NEXT: v_perm_b32 v19, v52, v19, s6 -; GFX9-NEXT: v_perm_b32 v22, v49, v22, s6 -; GFX9-NEXT: v_perm_b32 v23, v48, v23, s6 -; GFX9-NEXT: v_perm_b32 v24, v39, v24, s6 -; GFX9-NEXT: v_perm_b32 v25, v38, v25, s6 -; GFX9-NEXT: v_perm_b32 v26, v37, v26, s6 -; GFX9-NEXT: v_perm_b32 v27, v36, v27, s6 -; GFX9-NEXT: v_perm_b32 v28, v35, v28, s6 +; GFX9-NEXT: v_add_u16_e32 v38, 0x300, v24 +; GFX9-NEXT: v_add_u16_e32 v39, 0x300, v19 +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v22 +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v52 +; GFX9-NEXT: v_add_u16_e32 v52, 0x300, v55 +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v43 +; GFX9-NEXT: v_add_u16_e32 v55, 0x300, v47 +; GFX9-NEXT: v_perm_b32 v16, v55, v18, s6 +; GFX9-NEXT: v_perm_b32 v17, v54, v19, s6 +; GFX9-NEXT: v_perm_b32 v18, v53, v20, s6 +; GFX9-NEXT: v_perm_b32 v19, v52, v21, s6 +; GFX9-NEXT: v_perm_b32 v20, v51, v22, s6 +; GFX9-NEXT: v_perm_b32 v21, v50, v23, s6 +; GFX9-NEXT: v_perm_b32 v22, v49, v24, s6 +; GFX9-NEXT: v_perm_b32 v23, v48, v25, s6 +; GFX9-NEXT: v_perm_b32 v24, v39, v26, s6 +; GFX9-NEXT: v_perm_b32 v25, v38, v27, s6 +; GFX9-NEXT: v_perm_b32 v26, v37, v28, s6 +; GFX9-NEXT: v_perm_b32 v27, v36, v29, s6 +; GFX9-NEXT: v_perm_b32 v28, v35, v30, s6 +; GFX9-NEXT: v_perm_b32 v29, v34, v31, s6 +; GFX9-NEXT: v_perm_b32 v30, v33, v32, s6 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v47, 0x300, v56 +; GFX9-NEXT: v_add_u16_e32 v56, 0x300, v58 +; GFX9-NEXT: v_add_u16_e32 v43, 0x300, v61 +; GFX9-NEXT: v_add_u16_e32 v58, 0x300, v62 +; GFX9-NEXT: v_perm_b32 v12, v45, v58, s6 +; GFX9-NEXT: v_perm_b32 v13, v43, v59, s6 +; GFX9-NEXT: v_perm_b32 v14, v41, v56, s6 +; GFX9-NEXT: v_perm_b32 v15, v40, v47, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 ; GFX9-NEXT: .LBB92_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload @@ -180950,12 +180420,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s10, s16 +; SI-NEXT: s_mov_b32 s61, s21 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_writelane_b32 v61, s29, 0 ; SI-NEXT: v_writelane_b32 v61, s28, 1 -; SI-NEXT: v_writelane_b32 v61, s27, 2 -; SI-NEXT: s_mov_b32 s61, s21 ; SI-NEXT: v_writelane_b32 v63, s30, 0 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 @@ -180990,39 +180458,42 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_writelane_b32 v63, s87, 31 ; SI-NEXT: v_writelane_b32 v63, s96, 32 ; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: s_mov_b32 s67, s19 -; SI-NEXT: s_mov_b32 s54, s17 -; SI-NEXT: s_mov_b32 s35, s23 -; SI-NEXT: s_mov_b32 s39, s26 -; SI-NEXT: s_mov_b32 s62, s25 +; SI-NEXT: s_mov_b32 s67, s23 +; SI-NEXT: s_mov_b32 s54, s19 ; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s99, v1 -; SI-NEXT: v_readfirstlane_b32 s74, v24 +; SI-NEXT: s_mov_b32 s35, s26 +; SI-NEXT: v_readfirstlane_b32 s38, v1 +; SI-NEXT: v_readfirstlane_b32 s45, v21 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s6, v23 +; SI-NEXT: v_readfirstlane_b32 s74, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v62, s74, 0 +; SI-NEXT: v_writelane_b32 v62, s45, 0 +; SI-NEXT: v_readfirstlane_b32 s88, v23 +; SI-NEXT: v_writelane_b32 v62, s74, 1 ; SI-NEXT: v_readfirstlane_b32 s12, v26 -; SI-NEXT: v_writelane_b32 v62, s6, 1 +; SI-NEXT: v_writelane_b32 v62, s88, 2 ; SI-NEXT: v_readfirstlane_b32 s14, v25 -; SI-NEXT: v_writelane_b32 v62, s12, 2 +; SI-NEXT: v_writelane_b32 v62, s12, 3 ; SI-NEXT: v_readfirstlane_b32 s46, v28 -; SI-NEXT: v_writelane_b32 v62, s14, 3 +; SI-NEXT: v_writelane_b32 v62, s14, 4 ; SI-NEXT: v_readfirstlane_b32 s56, v27 -; SI-NEXT: v_writelane_b32 v62, s46, 4 +; SI-NEXT: v_writelane_b32 v62, s46, 5 ; SI-NEXT: v_readfirstlane_b32 s57, v30 -; SI-NEXT: v_writelane_b32 v62, s56, 5 -; SI-NEXT: v_readfirstlane_b32 s59, v29 -; SI-NEXT: v_writelane_b32 v62, s57, 6 -; SI-NEXT: v_writelane_b32 v62, s59, 7 +; SI-NEXT: v_writelane_b32 v62, s56, 6 +; SI-NEXT: s_mov_b32 s62, s25 +; SI-NEXT: v_readfirstlane_b32 s25, v29 +; SI-NEXT: v_writelane_b32 v62, s57, 7 +; SI-NEXT: v_writelane_b32 v62, s25, 8 +; SI-NEXT: s_mov_b32 s10, s16 ; SI-NEXT: s_mov_b32 s60, s20 ; SI-NEXT: s_mov_b32 s63, s24 +; SI-NEXT: s_mov_b32 s77, s27 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s99, v2 ; SI-NEXT: v_readfirstlane_b32 s95, v3 -; SI-NEXT: v_readfirstlane_b32 s31, v5 -; SI-NEXT: v_readfirstlane_b32 s24, v9 -; SI-NEXT: v_readfirstlane_b32 s38, v12 -; SI-NEXT: v_readfirstlane_b32 s36, v11 +; SI-NEXT: v_readfirstlane_b32 s90, v7 +; SI-NEXT: v_readfirstlane_b32 s31, v12 +; SI-NEXT: v_readfirstlane_b32 s24, v11 ; SI-NEXT: v_readfirstlane_b32 s8, v14 ; SI-NEXT: v_readfirstlane_b32 s27, v13 ; SI-NEXT: v_readfirstlane_b32 s9, v16 @@ -181031,18 +180502,17 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_readfirstlane_b32 s15, v17 ; SI-NEXT: v_readfirstlane_b32 s42, v20 ; SI-NEXT: v_readfirstlane_b32 s43, v19 -; SI-NEXT: v_readfirstlane_b32 s44, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 -; SI-NEXT: v_writelane_b32 v61, s4, 3 -; SI-NEXT: v_readfirstlane_b32 s45, v21 -; SI-NEXT: v_readfirstlane_b32 s98, v10 -; SI-NEXT: v_readfirstlane_b32 s90, v8 -; SI-NEXT: v_readfirstlane_b32 s88, v7 -; SI-NEXT: v_readfirstlane_b32 s91, v6 -; SI-NEXT: v_readfirstlane_b32 s93, v4 -; SI-NEXT: v_readfirstlane_b32 s55, v2 +; SI-NEXT: v_writelane_b32 v61, s4, 2 +; SI-NEXT: v_readfirstlane_b32 s44, v22 +; SI-NEXT: v_readfirstlane_b32 s93, v10 +; SI-NEXT: v_readfirstlane_b32 s20, v9 +; SI-NEXT: v_readfirstlane_b32 s96, v8 +; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: v_readfirstlane_b32 s91, v5 +; SI-NEXT: v_readfirstlane_b32 s55, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill @@ -181060,124 +180530,125 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 -; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: v_writelane_b32 v61, s4, 3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 -; SI-NEXT: v_writelane_b32 v61, s4, 5 +; SI-NEXT: v_writelane_b32 v61, s4, 4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 -; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: v_writelane_b32 v61, s4, 5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 -; SI-NEXT: v_writelane_b32 v61, s4, 7 +; SI-NEXT: v_writelane_b32 v61, s4, 6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:308 -; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 -; SI-NEXT: v_writelane_b32 v61, s4, 9 +; SI-NEXT: v_writelane_b32 v61, s4, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 -; SI-NEXT: v_writelane_b32 v61, s4, 10 +; SI-NEXT: v_writelane_b32 v61, s4, 9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 -; SI-NEXT: v_writelane_b32 v61, s4, 11 +; SI-NEXT: v_writelane_b32 v61, s4, 10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 -; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: v_writelane_b32 v61, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 -; SI-NEXT: v_writelane_b32 v61, s4, 13 +; SI-NEXT: v_writelane_b32 v61, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 -; SI-NEXT: v_writelane_b32 v61, s4, 14 +; SI-NEXT: v_writelane_b32 v61, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 -; SI-NEXT: v_writelane_b32 v61, s4, 15 +; SI-NEXT: v_writelane_b32 v61, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 -; SI-NEXT: v_writelane_b32 v61, s4, 16 +; SI-NEXT: v_writelane_b32 v61, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v61, s4, 17 +; SI-NEXT: v_writelane_b32 v61, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 -; SI-NEXT: v_writelane_b32 v61, s4, 18 +; SI-NEXT: v_writelane_b32 v61, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 -; SI-NEXT: v_writelane_b32 v61, s4, 19 +; SI-NEXT: v_writelane_b32 v61, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 -; SI-NEXT: v_writelane_b32 v61, s4, 20 +; SI-NEXT: v_writelane_b32 v61, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v61, s4, 21 +; SI-NEXT: v_writelane_b32 v61, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 -; SI-NEXT: v_writelane_b32 v61, s4, 22 +; SI-NEXT: v_writelane_b32 v61, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 -; SI-NEXT: v_writelane_b32 v61, s4, 23 +; SI-NEXT: v_writelane_b32 v61, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244 -; SI-NEXT: v_writelane_b32 v61, s4, 24 +; SI-NEXT: v_writelane_b32 v61, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 -; SI-NEXT: v_writelane_b32 v61, s4, 25 +; SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236 -; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: v_writelane_b32 v61, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 -; SI-NEXT: v_writelane_b32 v61, s4, 27 +; SI-NEXT: v_writelane_b32 v61, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228 -; SI-NEXT: v_writelane_b32 v61, s4, 28 +; SI-NEXT: v_writelane_b32 v61, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 -; SI-NEXT: v_writelane_b32 v61, s4, 29 +; SI-NEXT: v_writelane_b32 v61, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v61, s4, 30 +; SI-NEXT: v_writelane_b32 v61, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 -; SI-NEXT: v_writelane_b32 v61, s4, 31 +; SI-NEXT: v_writelane_b32 v61, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 -; SI-NEXT: v_writelane_b32 v61, s4, 32 +; SI-NEXT: v_writelane_b32 v61, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s16, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 +; SI-NEXT: v_writelane_b32 v61, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 @@ -181206,10 +180677,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_readfirstlane_b32 s21, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s85, v31 +; SI-NEXT: v_readfirstlane_b32 s81, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s81, v31 +; SI-NEXT: v_readfirstlane_b32 s85, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s97, v31 @@ -181230,45 +180701,45 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_readfirstlane_b32 s58, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s76, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s29, v31 +; SI-NEXT: v_readfirstlane_b32 s78, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; SI-NEXT: v_writelane_b32 v61, s4, 36 -; SI-NEXT: v_writelane_b32 v61, s54, 37 +; SI-NEXT: v_writelane_b32 v61, s17, 37 ; SI-NEXT: v_writelane_b32 v61, s10, 38 -; SI-NEXT: v_writelane_b32 v61, s67, 39 +; SI-NEXT: v_writelane_b32 v61, s54, 39 ; SI-NEXT: v_writelane_b32 v61, s18, 40 ; SI-NEXT: v_writelane_b32 v61, s61, 41 ; SI-NEXT: v_writelane_b32 v61, s60, 42 -; SI-NEXT: v_writelane_b32 v61, s35, 43 +; SI-NEXT: v_writelane_b32 v61, s67, 43 ; SI-NEXT: v_writelane_b32 v61, s22, 44 ; SI-NEXT: v_writelane_b32 v61, s62, 45 ; SI-NEXT: v_writelane_b32 v61, s63, 46 -; SI-NEXT: v_writelane_b32 v61, s39, 47 -; SI-NEXT: v_writelane_b32 v61, s99, 48 -; SI-NEXT: v_writelane_b32 v61, s95, 49 -; SI-NEXT: v_writelane_b32 v61, s31, 50 -; SI-NEXT: v_writelane_b32 v61, s24, 51 -; SI-NEXT: v_writelane_b32 v61, s38, 52 -; SI-NEXT: v_writelane_b32 v61, s36, 53 -; SI-NEXT: v_writelane_b32 v61, s8, 54 -; SI-NEXT: v_writelane_b32 v61, s27, 55 -; SI-NEXT: v_writelane_b32 v61, s9, 56 -; SI-NEXT: v_writelane_b32 v61, s79, 57 -; SI-NEXT: v_writelane_b32 v61, s13, 58 -; SI-NEXT: v_writelane_b32 v61, s15, 59 -; SI-NEXT: v_writelane_b32 v61, s42, 60 -; SI-NEXT: v_writelane_b32 v61, s43, 61 -; SI-NEXT: v_writelane_b32 v61, s44, 62 -; SI-NEXT: v_writelane_b32 v61, s45, 63 +; SI-NEXT: v_writelane_b32 v61, s77, 47 +; SI-NEXT: v_writelane_b32 v61, s35, 48 +; SI-NEXT: v_writelane_b32 v61, s99, 49 +; SI-NEXT: v_writelane_b32 v61, s38, 50 +; SI-NEXT: v_writelane_b32 v61, s95, 51 +; SI-NEXT: v_writelane_b32 v61, s90, 52 +; SI-NEXT: v_writelane_b32 v61, s31, 53 +; SI-NEXT: v_writelane_b32 v61, s24, 54 +; SI-NEXT: v_writelane_b32 v61, s8, 55 +; SI-NEXT: v_writelane_b32 v61, s27, 56 +; SI-NEXT: v_writelane_b32 v61, s9, 57 +; SI-NEXT: v_writelane_b32 v61, s79, 58 +; SI-NEXT: v_writelane_b32 v61, s13, 59 +; SI-NEXT: v_writelane_b32 v61, s15, 60 +; SI-NEXT: v_writelane_b32 v61, s42, 61 +; SI-NEXT: v_writelane_b32 v61, s43, 62 +; SI-NEXT: v_writelane_b32 v61, s44, 63 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s37, v31 +; SI-NEXT: v_readfirstlane_b32 s75, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s50, v31 @@ -181277,13 +180748,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_readfirstlane_b32 s48, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s19, v31 +; SI-NEXT: v_readfirstlane_b32 s23, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s64, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s17, v31 +; SI-NEXT: v_readfirstlane_b32 s19, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s65, v31 @@ -181295,7 +180766,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_readfirstlane_b32 s70, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s83, v31 +; SI-NEXT: v_readfirstlane_b32 s36, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s49, v31 @@ -181304,108 +180775,108 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_readfirstlane_b32 s80, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s82, v31 +; SI-NEXT: v_readfirstlane_b32 s83, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s87, v31 +; SI-NEXT: v_readfirstlane_b32 s84, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s84, v31 +; SI-NEXT: v_readfirstlane_b32 s51, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s51, v31 +; SI-NEXT: v_readfirstlane_b32 s86, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s86, v31 +; SI-NEXT: v_readfirstlane_b32 s82, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s94, v31 +; SI-NEXT: v_readfirstlane_b32 s98, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s96, v31 +; SI-NEXT: v_readfirstlane_b32 s87, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s68, v31 +; SI-NEXT: v_readfirstlane_b32 s66, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s34, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s77, v31 +; SI-NEXT: v_readfirstlane_b32 s69, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s66, v31 +; SI-NEXT: v_readfirstlane_b32 s30, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s78, v31 +; SI-NEXT: v_readfirstlane_b32 s37, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s53, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s69, v31 +; SI-NEXT: v_readfirstlane_b32 s68, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v31 +; SI-NEXT: v_readfirstlane_b32 s94, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s52, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: v_readfirstlane_b32 s28, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s23, v31 +; SI-NEXT: v_readfirstlane_b32 s26, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: v_readfirstlane_b32 s29, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s26, v31 +; SI-NEXT: v_readfirstlane_b32 s39, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s25, v31 +; SI-NEXT: v_readfirstlane_b32 s76, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: v_writelane_b32 v62, s25, 8 -; SI-NEXT: v_writelane_b32 v62, s28, 9 +; SI-NEXT: v_writelane_b32 v62, s76, 9 +; SI-NEXT: v_writelane_b32 v62, s29, 10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s92, v31 -; SI-NEXT: v_writelane_b32 v62, s92, 10 -; SI-NEXT: v_writelane_b32 v62, s75, 11 -; SI-NEXT: v_writelane_b32 v62, s26, 12 -; SI-NEXT: v_writelane_b32 v62, s30, 13 -; SI-NEXT: v_writelane_b32 v62, s23, 14 -; SI-NEXT: v_writelane_b32 v62, s52, 15 -; SI-NEXT: v_writelane_b32 v62, s64, 16 -; SI-NEXT: v_writelane_b32 v62, s17, 17 -; SI-NEXT: v_writelane_b32 v62, s65, 18 -; SI-NEXT: v_writelane_b32 v62, s70, 19 -; SI-NEXT: v_writelane_b32 v62, s71, 20 -; SI-NEXT: v_writelane_b32 v62, s49, 21 -; SI-NEXT: v_writelane_b32 v62, s83, 22 -; SI-NEXT: v_writelane_b32 v62, s80, 23 -; SI-NEXT: v_writelane_b32 v62, s82, 24 -; SI-NEXT: v_writelane_b32 v62, s84, 25 -; SI-NEXT: v_writelane_b32 v62, s87, 26 -; SI-NEXT: v_writelane_b32 v62, s86, 27 -; SI-NEXT: v_writelane_b32 v62, s51, 28 -; SI-NEXT: v_writelane_b32 v62, s96, 29 -; SI-NEXT: v_writelane_b32 v62, s34, 30 -; SI-NEXT: v_writelane_b32 v62, s94, 31 -; SI-NEXT: v_writelane_b32 v62, s53, 32 -; SI-NEXT: v_writelane_b32 v62, s66, 33 -; SI-NEXT: v_writelane_b32 v62, s68, 34 -; SI-NEXT: v_writelane_b32 v62, s69, 35 -; SI-NEXT: v_writelane_b32 v62, s77, 36 -; SI-NEXT: v_writelane_b32 v62, s78, 37 +; SI-NEXT: v_writelane_b32 v62, s92, 11 +; SI-NEXT: v_writelane_b32 v62, s28, 12 +; SI-NEXT: v_writelane_b32 v62, s39, 13 +; SI-NEXT: v_writelane_b32 v62, s94, 14 +; SI-NEXT: v_writelane_b32 v62, s26, 15 +; SI-NEXT: v_writelane_b32 v62, s52, 16 +; SI-NEXT: v_writelane_b32 v62, s64, 17 +; SI-NEXT: v_writelane_b32 v62, s19, 18 +; SI-NEXT: v_writelane_b32 v62, s65, 19 +; SI-NEXT: v_writelane_b32 v62, s70, 20 +; SI-NEXT: v_writelane_b32 v62, s71, 21 +; SI-NEXT: v_writelane_b32 v62, s49, 22 +; SI-NEXT: v_writelane_b32 v62, s36, 23 +; SI-NEXT: v_writelane_b32 v62, s80, 24 +; SI-NEXT: v_writelane_b32 v62, s83, 25 +; SI-NEXT: v_writelane_b32 v62, s51, 26 +; SI-NEXT: v_writelane_b32 v62, s84, 27 +; SI-NEXT: v_writelane_b32 v62, s82, 28 +; SI-NEXT: v_writelane_b32 v62, s86, 29 +; SI-NEXT: v_writelane_b32 v62, s87, 30 +; SI-NEXT: v_writelane_b32 v62, s34, 31 +; SI-NEXT: v_writelane_b32 v62, s98, 32 +; SI-NEXT: v_writelane_b32 v62, s53, 33 +; SI-NEXT: v_writelane_b32 v62, s30, 34 +; SI-NEXT: v_writelane_b32 v62, s66, 35 +; SI-NEXT: v_writelane_b32 v62, s68, 36 +; SI-NEXT: v_writelane_b32 v62, s69, 37 +; SI-NEXT: v_writelane_b32 v62, s37, 38 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s67, 8 +; SI-NEXT: s_lshl_b32 s5, s54, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_and_b32 s4, s60, 0xff @@ -181413,7 +180884,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s35, 8 +; SI-NEXT: s_lshl_b32 s5, s67, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -181424,9 +180895,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s5, v61, 2 -; SI-NEXT: s_and_b32 s4, s39, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_and_b32 s4, s35, 0xff +; SI-NEXT: s_lshl_b32 s5, s77, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -181437,28 +180907,28 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s99, 0xff -; SI-NEXT: s_lshl_b32 s5, s55, 8 +; SI-NEXT: s_and_b32 s4, s38, 0xff +; SI-NEXT: s_lshl_b32 s5, s99, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_and_b32 s4, s95, 0xff -; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: s_lshl_b32 s5, s55, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s31, 0xff -; SI-NEXT: s_lshl_b32 s5, s91, 8 +; SI-NEXT: s_and_b32 s4, s91, 0xff +; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s88, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_and_b32 s4, s90, 0xff +; SI-NEXT: s_lshl_b32 s5, s96, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s98, 8 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s93, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s36, 0xff -; SI-NEXT: s_lshl_b32 s5, s38, 8 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s31, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: s_and_b32 s4, s27, 0xff @@ -181481,7 +180951,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_and_b32 s4, s88, 0xff ; SI-NEXT: s_lshl_b32 s5, s74, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 @@ -181493,59 +180963,59 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s46, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_and_b32 s4, s25, 0xff ; SI-NEXT: s_lshl_b32 s5, s57, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_and_b32 s4, s92, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_lshl_b32 s5, s76, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_and_b32 s4, s39, 0xff +; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 ; SI-NEXT: s_and_b32 s4, s52, 0xff -; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_lshl_b32 s5, s94, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_and_b32 s4, s69, 0xff +; SI-NEXT: s_and_b32 s4, s68, 0xff ; SI-NEXT: s_lshl_b32 s5, s53, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_and_b32 s4, s78, 0xff -; SI-NEXT: s_lshl_b32 s5, s66, 8 +; SI-NEXT: s_and_b32 s4, s37, 0xff +; SI-NEXT: s_lshl_b32 s5, s30, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_and_b32 s4, s77, 0xff +; SI-NEXT: s_and_b32 s4, s69, 0xff ; SI-NEXT: s_lshl_b32 s5, s34, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s68, 0xff -; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s87, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_and_b32 s4, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_and_b32 s4, s98, 0xff +; SI-NEXT: s_lshl_b32 s5, s82, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_and_b32 s4, s51, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_and_b32 s4, s86, 0xff +; SI-NEXT: s_lshl_b32 s5, s51, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_and_b32 s4, s87, 0xff -; SI-NEXT: s_lshl_b32 s5, s82, 8 +; SI-NEXT: s_and_b32 s4, s84, 0xff +; SI-NEXT: s_lshl_b32 s5, s83, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 ; SI-NEXT: s_and_b32 s4, s80, 0xff ; SI-NEXT: s_lshl_b32 s5, s49, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_and_b32 s4, s83, 0xff +; SI-NEXT: s_and_b32 s4, s36, 0xff ; SI-NEXT: s_lshl_b32 s5, s70, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 @@ -181553,24 +181023,24 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s65, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_and_b32 s4, s17, 0xff +; SI-NEXT: s_and_b32 s4, s19, 0xff ; SI-NEXT: s_lshl_b32 s5, s64, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_and_b32 s4, s19, 0xff +; SI-NEXT: s_and_b32 s4, s23, 0xff ; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 ; SI-NEXT: s_and_b32 s4, s50, 0xff -; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_lshl_b32 s5, s75, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_readlane_b32 s8, v61, 36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 ; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_lshl_b32 s5, s78, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_and_b32 s4, s76, 0xff +; SI-NEXT: s_and_b32 s4, s59, 0xff ; SI-NEXT: s_lshl_b32 s5, s58, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 @@ -181583,10 +181053,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 ; SI-NEXT: s_and_b32 s4, s97, 0xff -; SI-NEXT: s_lshl_b32 s5, s81, 8 +; SI-NEXT: s_lshl_b32 s5, s85, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_and_b32 s4, s81, 0xff ; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 @@ -181604,127 +181074,129 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s89, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_readlane_b32 s66, v61, 33 +; SI-NEXT: v_readlane_b32 s30, v61, 32 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 ; SI-NEXT: s_and_b32 s4, s66, 0xff -; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_lshl_b32 s5, s30, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s53, v61, 32 -; SI-NEXT: v_readlane_b32 s94, v61, 31 +; SI-NEXT: v_readlane_b32 s53, v61, 31 +; SI-NEXT: v_readlane_b32 s98, v61, 30 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 ; SI-NEXT: s_and_b32 s4, s53, 0xff -; SI-NEXT: s_lshl_b32 s5, s94, 8 +; SI-NEXT: s_lshl_b32 s5, s98, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s34, v61, 30 -; SI-NEXT: v_readlane_b32 s96, v61, 29 +; SI-NEXT: v_readlane_b32 s34, v61, 29 +; SI-NEXT: v_readlane_b32 s87, v61, 28 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 ; SI-NEXT: s_and_b32 s4, s34, 0xff -; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_lshl_b32 s5, s87, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s51, v61, 28 ; SI-NEXT: v_readlane_b32 s86, v61, 27 +; SI-NEXT: v_readlane_b32 s82, v61, 26 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 -; SI-NEXT: s_and_b32 s4, s51, 0xff -; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_and_b32 s4, s86, 0xff +; SI-NEXT: s_lshl_b32 s5, s82, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s87, v61, 26 ; SI-NEXT: v_readlane_b32 s84, v61, 25 +; SI-NEXT: v_readlane_b32 s51, v61, 24 ; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 -; SI-NEXT: s_and_b32 s4, s87, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_and_b32 s4, s84, 0xff +; SI-NEXT: s_lshl_b32 s5, s51, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s82, v61, 24 -; SI-NEXT: v_readlane_b32 s80, v61, 23 +; SI-NEXT: v_readlane_b32 s83, v61, 23 +; SI-NEXT: v_readlane_b32 s80, v61, 22 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 -; SI-NEXT: s_and_b32 s4, s82, 0xff +; SI-NEXT: s_and_b32 s4, s83, 0xff ; SI-NEXT: s_lshl_b32 s5, s80, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s83, v61, 22 -; SI-NEXT: v_readlane_b32 s49, v61, 21 +; SI-NEXT: v_readlane_b32 s36, v61, 21 +; SI-NEXT: v_readlane_b32 s49, v61, 20 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 -; SI-NEXT: s_and_b32 s4, s83, 0xff +; SI-NEXT: s_and_b32 s4, s36, 0xff ; SI-NEXT: s_lshl_b32 s5, s49, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s71, v61, 20 -; SI-NEXT: v_readlane_b32 s70, v61, 19 +; SI-NEXT: v_readlane_b32 s71, v61, 19 +; SI-NEXT: v_readlane_b32 s70, v61, 18 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 ; SI-NEXT: s_and_b32 s4, s71, 0xff ; SI-NEXT: s_lshl_b32 s5, s70, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s65, v61, 18 -; SI-NEXT: v_readlane_b32 s54, v61, 17 +; SI-NEXT: v_readlane_b32 s65, v61, 17 +; SI-NEXT: v_readlane_b32 s54, v61, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 ; SI-NEXT: s_and_b32 s4, s65, 0xff ; SI-NEXT: s_lshl_b32 s5, s54, 8 -; SI-NEXT: s_mov_b32 s17, s19 -; SI-NEXT: s_mov_b32 s19, s50 +; SI-NEXT: s_mov_b32 s19, s23 +; SI-NEXT: s_mov_b32 s23, s50 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s67, v61, 16 -; SI-NEXT: v_readlane_b32 s50, v61, 15 +; SI-NEXT: v_readlane_b32 s67, v61, 15 +; SI-NEXT: v_readlane_b32 s50, v61, 14 ; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 ; SI-NEXT: s_and_b32 s4, s67, 0xff ; SI-NEXT: s_lshl_b32 s5, s50, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s64, v61, 14 -; SI-NEXT: v_readlane_b32 s52, v61, 13 +; SI-NEXT: v_readlane_b32 s64, v61, 13 +; SI-NEXT: v_readlane_b32 s52, v61, 12 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 ; SI-NEXT: s_and_b32 s4, s64, 0xff ; SI-NEXT: s_lshl_b32 s5, s52, 8 -; SI-NEXT: s_mov_b32 s23, s48 +; SI-NEXT: s_mov_b32 s26, s48 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s35, v61, 12 -; SI-NEXT: v_readlane_b32 s48, v61, 11 +; SI-NEXT: v_readlane_b32 s35, v61, 11 +; SI-NEXT: v_readlane_b32 s48, v61, 10 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 ; SI-NEXT: s_and_b32 s4, s35, 0xff ; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s30, v61, 10 -; SI-NEXT: v_readlane_b32 s39, v61, 9 +; SI-NEXT: v_readlane_b32 s94, v61, 9 +; SI-NEXT: v_readlane_b32 s39, v61, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 +; SI-NEXT: s_and_b32 s4, s94, 0xff +; SI-NEXT: s_lshl_b32 s5, s39, 8 +; SI-NEXT: s_mov_b32 s28, s75 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s37, v61, 7 +; SI-NEXT: v_readlane_b32 s75, v61, 6 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s30, 0xff -; SI-NEXT: s_lshl_b32 s5, s39, 8 -; SI-NEXT: s_mov_b32 s26, s37 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s37, v61, 8 -; SI-NEXT: v_readlane_b32 s75, v61, 7 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 ; SI-NEXT: s_and_b32 s4, s37, 0xff ; SI-NEXT: s_lshl_b32 s5, s75, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s92, v61, 6 -; SI-NEXT: v_readlane_b32 s77, v61, 5 +; SI-NEXT: v_readlane_b32 s92, v61, 5 +; SI-NEXT: v_readlane_b32 s77, v61, 4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s92, 0xff ; SI-NEXT: s_lshl_b32 s5, s77, 8 -; SI-NEXT: s_mov_b32 s28, s29 -; SI-NEXT: s_mov_b32 s29, s76 +; SI-NEXT: s_mov_b32 s29, s78 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s78, v61, 4 -; SI-NEXT: v_readlane_b32 s76, v61, 3 +; SI-NEXT: v_readlane_b32 s78, v61, 3 +; SI-NEXT: v_readlane_b32 s76, v61, 2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s78, 0xff ; SI-NEXT: s_lshl_b32 s5, s76, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_mov_b32 s99, s55 -; SI-NEXT: s_mov_b32 s20, s88 -; SI-NEXT: s_mov_b32 s24, s98 +; SI-NEXT: s_mov_b32 s95, s55 +; SI-NEXT: s_mov_b32 s16, s20 +; SI-NEXT: s_mov_b32 s25, s59 ; SI-NEXT: s_mov_b32 s59, s58 ; SI-NEXT: s_mov_b32 s56, s47 ; SI-NEXT: s_mov_b32 s46, s41 ; SI-NEXT: s_mov_b32 s12, s11 ; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_mov_b32 s7, s97 -; SI-NEXT: s_mov_b32 s97, s81 -; SI-NEXT: s_mov_b32 s81, s85 -; SI-NEXT: s_mov_b32 s6, s40 +; SI-NEXT: s_mov_b32 s97, s85 +; SI-NEXT: s_mov_b32 s85, s81 +; SI-NEXT: s_mov_b32 s88, s40 ; SI-NEXT: s_mov_b32 s40, s72 ; SI-NEXT: s_mov_b32 s45, s73 ; SI-NEXT: s_mov_b32 s15, s89 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_mov_b32 s55, s93 -; SI-NEXT: s_mov_b32 s95, s91 -; SI-NEXT: s_mov_b32 s31, s90 +; SI-NEXT: s_mov_b32 s38, s6 +; SI-NEXT: s_mov_b32 s55, s91 +; SI-NEXT: s_mov_b32 s99, s96 +; SI-NEXT: s_mov_b32 s90, s93 +; SI-NEXT: s_mov_b32 s6, s8 ; SI-NEXT: s_cbranch_execnz .LBB93_3 ; SI-NEXT: .LBB93_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s78, 3 @@ -181739,7 +181211,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s75, 8 ; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo -; SI-NEXT: s_add_i32 vcc_hi, s30, 3 +; SI-NEXT: s_add_i32 vcc_hi, s94, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s60, s39, 8 ; SI-NEXT: s_or_b32 s60, s60, vcc_hi @@ -181763,32 +181235,32 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s73, s70, 8 ; SI-NEXT: s_or_b32 s73, s73, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s83, 3 +; SI-NEXT: s_add_i32 vcc_hi, s36, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s74, s49, 8 ; SI-NEXT: s_or_b32 s74, s74, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s82, 3 +; SI-NEXT: s_add_i32 vcc_hi, s83, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s75, s80, 8 ; SI-NEXT: s_or_b32 s75, s75, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s87, 3 +; SI-NEXT: s_add_i32 vcc_hi, s84, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s76, s84, 8 +; SI-NEXT: s_lshl_b32 s76, s51, 8 ; SI-NEXT: s_or_b32 s76, s76, vcc_hi -; SI-NEXT: s_add_i32 vcc_hi, s51, 3 +; SI-NEXT: s_add_i32 vcc_hi, s86, 3 ; SI-NEXT: s_add_i32 s93, s53, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff -; SI-NEXT: s_lshl_b32 s77, s86, 8 +; SI-NEXT: s_lshl_b32 s77, s82, 8 ; SI-NEXT: s_add_i32 s89, s34, 3 ; SI-NEXT: s_and_b32 s93, s93, 0xff -; SI-NEXT: s_lshl_b32 s78, s94, 8 +; SI-NEXT: s_lshl_b32 s78, s98, 8 ; SI-NEXT: s_add_i32 s34, s66, 3 ; SI-NEXT: s_or_b32 s77, s77, vcc_hi ; SI-NEXT: s_and_b32 s89, s89, 0xff -; SI-NEXT: s_lshl_b32 vcc_hi, s96, 8 +; SI-NEXT: s_lshl_b32 vcc_hi, s87, 8 ; SI-NEXT: s_or_b32 s22, s78, s93 ; SI-NEXT: s_and_b32 s93, s34, 0xff -; SI-NEXT: s_lshl_b32 s92, s16, 8 +; SI-NEXT: s_lshl_b32 s92, s30, 8 ; SI-NEXT: s_add_i32 s53, s68, 3 ; SI-NEXT: s_or_b32 s89, vcc_hi, s89 ; SI-NEXT: s_or_b32 s92, s92, s93 @@ -181798,11 +181270,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_or_b32 s93, vcc_hi, s93 ; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff ; SI-NEXT: s_lshl_b32 s34, s45, 8 -; SI-NEXT: s_add_i32 s68, s6, 3 +; SI-NEXT: s_add_i32 s68, s88, 3 ; SI-NEXT: s_or_b32 vcc_hi, s34, vcc_hi ; SI-NEXT: s_and_b32 s34, s68, 0xff ; SI-NEXT: s_lshl_b32 s39, s40, 8 -; SI-NEXT: s_add_i32 s69, s81, 3 +; SI-NEXT: s_add_i32 s69, s85, 3 ; SI-NEXT: s_or_b32 s34, s39, s34 ; SI-NEXT: s_and_b32 s39, s69, 0xff ; SI-NEXT: s_lshl_b32 s52, s21, 8 @@ -181818,209 +181290,208 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_or_b32 s53, s64, s53 ; SI-NEXT: s_and_b32 s64, s97, 0xff ; SI-NEXT: s_lshl_b32 s66, s46, 8 -; SI-NEXT: s_add_i32 s21, s29, 3 +; SI-NEXT: s_add_i32 s21, s25, 3 ; SI-NEXT: s_or_b32 s64, s66, s64 ; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_lshl_b32 s66, s59, 8 -; SI-NEXT: s_add_i32 s25, s8, 3 +; SI-NEXT: s_add_i32 s25, s6, 3 ; SI-NEXT: s_or_b32 s66, s66, s21 ; SI-NEXT: s_and_b32 s21, s25, 0xff -; SI-NEXT: s_lshl_b32 s6, s28, 8 -; SI-NEXT: s_add_i32 s29, s19, 3 +; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: s_add_i32 s29, s23, 3 ; SI-NEXT: s_or_b32 s67, s6, s21 ; SI-NEXT: s_and_b32 s6, s29, 0xff -; SI-NEXT: s_lshl_b32 s18, s26, 8 -; SI-NEXT: s_add_i32 s28, s17, 3 +; SI-NEXT: s_lshl_b32 s18, s28, 8 +; SI-NEXT: s_add_i32 s28, s19, 3 ; SI-NEXT: s_or_b32 s68, s18, s6 ; SI-NEXT: s_and_b32 s6, s28, 0xff -; SI-NEXT: s_lshl_b32 s18, s23, 8 +; SI-NEXT: s_lshl_b32 s18, s26, 8 ; SI-NEXT: s_or_b32 s69, s18, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 17 +; SI-NEXT: v_readlane_b32 s6, v62, 18 +; SI-NEXT: s_mov_b32 s91, s16 ; SI-NEXT: s_add_i32 s7, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 15 +; SI-NEXT: v_readlane_b32 s16, v62, 16 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v62, 16 +; SI-NEXT: v_readlane_b32 s7, v62, 17 ; SI-NEXT: s_add_i32 s27, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 13 +; SI-NEXT: v_readlane_b32 s16, v62, 14 ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_lshl_b32 s23, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 14 -; SI-NEXT: s_mov_b32 s91, s24 +; SI-NEXT: v_readlane_b32 s16, v62, 15 ; SI-NEXT: s_or_b32 s70, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 20 +; SI-NEXT: v_readlane_b32 s6, v62, 21 ; SI-NEXT: s_add_i32 s24, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 11 +; SI-NEXT: v_readlane_b32 s16, v62, 12 ; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 18 +; SI-NEXT: v_readlane_b32 s7, v62, 19 ; SI-NEXT: s_lshl_b32 s19, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 12 -; SI-NEXT: s_mov_b32 s90, s20 +; SI-NEXT: v_readlane_b32 s16, v62, 13 ; SI-NEXT: s_and_b32 s6, s11, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s20, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 9 +; SI-NEXT: v_readlane_b32 s16, v62, 10 ; SI-NEXT: s_or_b32 s71, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 22 +; SI-NEXT: v_readlane_b32 s6, v62, 23 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s17, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 10 +; SI-NEXT: v_readlane_b32 s16, v62, 11 ; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 19 +; SI-NEXT: v_readlane_b32 s7, v62, 20 ; SI-NEXT: s_or_b32 s17, s17, s20 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s20, v62, 8 +; SI-NEXT: v_readlane_b32 s20, v62, 9 ; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 ; SI-NEXT: s_or_b32 s81, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 23 +; SI-NEXT: v_readlane_b32 s6, v62, 24 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_or_b32 s16, s20, s16 -; SI-NEXT: v_readlane_b32 s20, v62, 7 +; SI-NEXT: v_readlane_b32 s20, v62, 8 ; SI-NEXT: s_add_i32 s14, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 21 +; SI-NEXT: v_readlane_b32 s7, v62, 22 ; SI-NEXT: s_or_b32 s19, s19, s24 ; SI-NEXT: s_add_i32 s98, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v62, 6 +; SI-NEXT: v_readlane_b32 s24, v62, 7 ; SI-NEXT: s_and_b32 s6, s14, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s20, s98, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_or_b32 s83, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 26 +; SI-NEXT: v_readlane_b32 s6, v62, 27 ; SI-NEXT: s_and_b32 s27, s27, 0xff ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v62, 5 +; SI-NEXT: v_readlane_b32 s24, v62, 6 ; SI-NEXT: s_add_i32 s41, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 24 +; SI-NEXT: v_readlane_b32 s7, v62, 25 ; SI-NEXT: s_or_b32 s23, s23, s27 ; SI-NEXT: s_add_i32 s86, s24, 3 -; SI-NEXT: v_readlane_b32 s27, v62, 4 +; SI-NEXT: v_readlane_b32 s27, v62, 5 ; SI-NEXT: s_and_b32 s6, s41, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s24, s86, 0xff ; SI-NEXT: s_lshl_b32 s27, s27, 8 ; SI-NEXT: s_or_b32 s85, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 28 +; SI-NEXT: v_readlane_b32 s6, v62, 29 ; SI-NEXT: s_or_b32 s24, s27, s24 -; SI-NEXT: v_readlane_b32 s27, v62, 3 +; SI-NEXT: v_readlane_b32 s27, v62, 4 ; SI-NEXT: s_add_i32 s46, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 25 +; SI-NEXT: v_readlane_b32 s7, v62, 26 ; SI-NEXT: s_add_i32 s12, s73, 0x300 ; SI-NEXT: s_add_i32 s82, s27, 3 -; SI-NEXT: v_readlane_b32 s73, v62, 2 +; SI-NEXT: v_readlane_b32 s73, v62, 3 ; SI-NEXT: s_and_b32 s6, s46, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s27, s82, 0xff ; SI-NEXT: s_lshl_b32 s73, s73, 8 ; SI-NEXT: s_or_b32 s96, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 31 +; SI-NEXT: v_readlane_b32 s6, v62, 32 ; SI-NEXT: s_or_b32 s27, s73, s27 -; SI-NEXT: v_readlane_b32 s73, v62, 1 +; SI-NEXT: v_readlane_b32 s73, v62, 2 ; SI-NEXT: s_add_i32 s47, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 27 +; SI-NEXT: v_readlane_b32 s7, v62, 28 ; SI-NEXT: s_add_i32 s13, s74, 0x300 ; SI-NEXT: s_add_i32 s65, s73, 3 -; SI-NEXT: v_readlane_b32 s74, v62, 0 +; SI-NEXT: v_readlane_b32 s74, v62, 1 ; SI-NEXT: s_and_b32 s6, s47, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s73, s65, 0xff ; SI-NEXT: s_lshl_b32 s74, s74, 8 ; SI-NEXT: s_or_b32 s97, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 34 +; SI-NEXT: v_readlane_b32 s6, v62, 35 ; SI-NEXT: s_or_b32 s73, s74, s73 -; SI-NEXT: v_readlane_b32 s74, v61, 63 +; SI-NEXT: v_readlane_b32 s74, v62, 0 ; SI-NEXT: s_add_i32 s56, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 29 +; SI-NEXT: v_readlane_b32 s7, v62, 30 ; SI-NEXT: s_add_i32 s14, s75, 0x300 ; SI-NEXT: s_add_i32 s54, s74, 3 -; SI-NEXT: v_readlane_b32 s75, v61, 62 +; SI-NEXT: v_readlane_b32 s75, v61, 63 ; SI-NEXT: s_and_b32 s6, s56, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s74, s54, 0xff ; SI-NEXT: s_lshl_b32 s75, s75, 8 ; SI-NEXT: s_or_b32 s63, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 36 +; SI-NEXT: v_readlane_b32 s6, v62, 37 ; SI-NEXT: s_or_b32 s74, s75, s74 -; SI-NEXT: v_readlane_b32 s75, v61, 61 +; SI-NEXT: v_readlane_b32 s75, v61, 62 ; SI-NEXT: s_add_i32 s58, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 30 +; SI-NEXT: v_readlane_b32 s7, v62, 31 ; SI-NEXT: s_add_i32 s15, s76, 0x300 ; SI-NEXT: s_add_i32 s50, s75, 3 -; SI-NEXT: v_readlane_b32 s76, v61, 60 +; SI-NEXT: v_readlane_b32 s76, v61, 61 ; SI-NEXT: s_and_b32 s6, s58, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s75, s50, 0xff ; SI-NEXT: s_lshl_b32 s76, s76, 8 ; SI-NEXT: s_or_b32 s79, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 37 +; SI-NEXT: v_readlane_b32 s6, v62, 38 ; SI-NEXT: s_or_b32 s75, s76, s75 -; SI-NEXT: v_readlane_b32 s76, v61, 59 +; SI-NEXT: v_readlane_b32 s76, v61, 60 ; SI-NEXT: s_add_i32 s59, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 33 +; SI-NEXT: v_readlane_b32 s7, v62, 34 ; SI-NEXT: s_add_i32 s18, s77, 0x300 ; SI-NEXT: s_add_i32 s48, s76, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 58 +; SI-NEXT: v_readlane_b32 s77, v61, 59 ; SI-NEXT: s_and_b32 s6, s59, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s76, s48, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 8 ; SI-NEXT: s_or_b32 s78, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 35 +; SI-NEXT: v_readlane_b32 s6, v62, 36 ; SI-NEXT: s_or_b32 s76, s77, s76 -; SI-NEXT: v_readlane_b32 s77, v61, 57 +; SI-NEXT: v_readlane_b32 s77, v61, 58 ; SI-NEXT: s_add_i32 s57, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 32 +; SI-NEXT: v_readlane_b32 s7, v62, 33 ; SI-NEXT: s_add_i32 s11, s72, 0x300 ; SI-NEXT: s_add_i32 s72, s79, 0x300 ; SI-NEXT: s_add_i32 s37, s77, 3 -; SI-NEXT: v_readlane_b32 s79, v61, 56 +; SI-NEXT: v_readlane_b32 s79, v61, 57 ; SI-NEXT: s_and_b32 s6, s57, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s77, s37, 0xff ; SI-NEXT: s_lshl_b32 s79, s79, 8 ; SI-NEXT: s_or_b32 s88, s7, s6 ; SI-NEXT: s_or_b32 s77, s79, s77 -; SI-NEXT: v_readlane_b32 s79, v61, 55 +; SI-NEXT: v_readlane_b32 s79, v61, 56 ; SI-NEXT: s_add_i32 s21, s89, 0x300 ; SI-NEXT: s_add_i32 s89, s88, 0x300 ; SI-NEXT: s_add_i32 s35, s79, 3 -; SI-NEXT: v_readlane_b32 s88, v61, 54 +; SI-NEXT: v_readlane_b32 s88, v61, 55 ; SI-NEXT: s_and_b32 s79, s35, 0xff ; SI-NEXT: s_lshl_b32 s88, s88, 8 ; SI-NEXT: s_or_b32 s79, s88, s79 -; SI-NEXT: v_readlane_b32 s88, v61, 53 +; SI-NEXT: v_readlane_b32 s88, v61, 54 ; SI-NEXT: s_add_i32 s25, s92, 0x300 ; SI-NEXT: s_add_i32 s30, s88, 3 -; SI-NEXT: v_readlane_b32 s92, v61, 52 +; SI-NEXT: v_readlane_b32 s92, v61, 53 ; SI-NEXT: s_and_b32 s88, s30, 0xff ; SI-NEXT: s_lshl_b32 s92, s92, 8 +; SI-NEXT: s_add_i32 s94, s91, 3 +; SI-NEXT: s_lshl_b32 s91, s90, 8 +; SI-NEXT: v_readlane_b32 s90, v61, 52 ; SI-NEXT: s_or_b32 s88, s92, s88 -; SI-NEXT: v_readlane_b32 s92, v61, 51 -; SI-NEXT: s_add_i32 s94, s92, 3 ; SI-NEXT: s_and_b32 s92, s94, 0xff -; SI-NEXT: s_lshl_b32 s91, s91, 8 ; SI-NEXT: s_add_i32 s90, s90, 3 ; SI-NEXT: s_or_b32 s91, s91, s92 ; SI-NEXT: s_and_b32 s90, s90, 0xff -; SI-NEXT: s_lshl_b32 s92, s31, 8 +; SI-NEXT: s_lshl_b32 s92, s99, 8 ; SI-NEXT: s_or_b32 s90, s92, s90 -; SI-NEXT: v_readlane_b32 s92, v61, 50 -; SI-NEXT: s_add_i32 s92, s92, 3 +; SI-NEXT: s_add_i32 s92, s55, 3 ; SI-NEXT: s_add_i32 s26, s93, 0x300 ; SI-NEXT: s_and_b32 s92, s92, 0xff -; SI-NEXT: s_lshl_b32 s93, s95, 8 +; SI-NEXT: s_lshl_b32 s93, s38, 8 ; SI-NEXT: s_or_b32 s92, s93, s92 -; SI-NEXT: v_readlane_b32 s93, v61, 49 +; SI-NEXT: v_readlane_b32 s93, v61, 51 ; SI-NEXT: s_add_i32 s93, s93, 3 ; SI-NEXT: s_and_b32 s93, s93, 0xff -; SI-NEXT: s_lshl_b32 s94, s55, 8 +; SI-NEXT: s_lshl_b32 s94, s95, 8 ; SI-NEXT: s_or_b32 s93, s94, s93 -; SI-NEXT: v_readlane_b32 s94, v61, 48 +; SI-NEXT: v_readlane_b32 s94, v61, 50 ; SI-NEXT: s_add_i32 s94, s94, 3 +; SI-NEXT: v_readlane_b32 s95, v61, 49 ; SI-NEXT: s_and_b32 s94, s94, 0xff -; SI-NEXT: s_lshl_b32 s95, s99, 8 +; SI-NEXT: s_lshl_b32 s95, s95, 8 ; SI-NEXT: s_or_b32 s94, s95, s94 ; SI-NEXT: v_readlane_b32 s95, v61, 1 ; SI-NEXT: s_add_i32 s95, s95, 3 @@ -182028,10 +181499,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_add_i32 s6, vcc_lo, 0x300 ; SI-NEXT: s_and_b32 s95, s95, 0xff ; SI-NEXT: s_lshl_b32 vcc_lo, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 47 +; SI-NEXT: v_readlane_b32 s30, v61, 48 ; SI-NEXT: s_or_b32 s95, vcc_lo, s95 ; SI-NEXT: s_add_i32 vcc_lo, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 2 +; SI-NEXT: v_readlane_b32 s30, v61, 47 ; SI-NEXT: s_add_i32 s28, vcc_hi, 0x300 ; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s30, 8 @@ -182183,24 +181654,27 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v56, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: .LBB93_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 ; SI-NEXT: v_readlane_b32 s96, v63, 32 @@ -182438,20 +181912,17 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x74, v0 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -182479,69 +181950,70 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: .LBB93_4: ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s17, s19 +; SI-NEXT: s_mov_b32 s19, s23 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s19, s50 +; SI-NEXT: s_mov_b32 s23, s50 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_mov_b32 s23, s48 -; SI-NEXT: s_mov_b32 s26, s37 -; SI-NEXT: s_mov_b32 s28, s29 -; SI-NEXT: s_mov_b32 s29, s76 +; SI-NEXT: s_mov_b32 s26, s48 +; SI-NEXT: s_mov_b32 s28, s75 +; SI-NEXT: s_mov_b32 s29, s78 +; SI-NEXT: s_mov_b32 s25, s59 ; SI-NEXT: s_mov_b32 s59, s58 ; SI-NEXT: s_mov_b32 s56, s47 ; SI-NEXT: s_mov_b32 s46, s41 ; SI-NEXT: s_mov_b32 s12, s11 ; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_mov_b32 s7, s97 -; SI-NEXT: s_mov_b32 s97, s81 -; SI-NEXT: s_mov_b32 s81, s85 -; SI-NEXT: s_mov_b32 s6, s40 +; SI-NEXT: s_mov_b32 s97, s85 +; SI-NEXT: s_mov_b32 s85, s81 +; SI-NEXT: s_mov_b32 s88, s40 ; SI-NEXT: s_mov_b32 s40, s72 ; SI-NEXT: s_mov_b32 s45, s73 ; SI-NEXT: s_mov_b32 s15, s89 -; SI-NEXT: s_mov_b32 s24, s98 -; SI-NEXT: s_mov_b32 s20, s88 -; SI-NEXT: s_mov_b32 s99, s55 +; SI-NEXT: s_mov_b32 s16, s20 +; SI-NEXT: s_mov_b32 s95, s55 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_readlane_b32 s75, v61, 7 -; SI-NEXT: v_readlane_b32 s76, v61, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 5 -; SI-NEXT: v_readlane_b32 s78, v61, 4 -; SI-NEXT: v_readlane_b32 s92, v61, 6 -; SI-NEXT: v_readlane_b32 s39, v61, 9 -; SI-NEXT: v_readlane_b32 s37, v61, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 10 -; SI-NEXT: v_readlane_b32 s48, v61, 11 -; SI-NEXT: v_readlane_b32 s52, v61, 13 -; SI-NEXT: v_readlane_b32 s35, v61, 12 -; SI-NEXT: v_readlane_b32 s50, v61, 15 -; SI-NEXT: v_readlane_b32 s64, v61, 14 -; SI-NEXT: v_readlane_b32 s54, v61, 17 -; SI-NEXT: v_readlane_b32 s67, v61, 16 -; SI-NEXT: v_readlane_b32 s65, v61, 18 -; SI-NEXT: v_readlane_b32 s70, v61, 19 -; SI-NEXT: v_readlane_b32 s49, v61, 21 -; SI-NEXT: v_readlane_b32 s71, v61, 20 -; SI-NEXT: v_readlane_b32 s80, v61, 23 -; SI-NEXT: v_readlane_b32 s83, v61, 22 +; SI-NEXT: v_readlane_b32 s75, v61, 6 +; SI-NEXT: v_readlane_b32 s76, v61, 2 +; SI-NEXT: v_readlane_b32 s77, v61, 4 +; SI-NEXT: v_readlane_b32 s78, v61, 3 +; SI-NEXT: v_readlane_b32 s92, v61, 5 +; SI-NEXT: v_readlane_b32 s39, v61, 8 +; SI-NEXT: v_readlane_b32 s37, v61, 7 +; SI-NEXT: v_readlane_b32 s94, v61, 9 +; SI-NEXT: v_readlane_b32 s48, v61, 10 +; SI-NEXT: v_readlane_b32 s52, v61, 12 +; SI-NEXT: v_readlane_b32 s35, v61, 11 +; SI-NEXT: v_readlane_b32 s50, v61, 14 +; SI-NEXT: v_readlane_b32 s64, v61, 13 +; SI-NEXT: v_readlane_b32 s54, v61, 16 +; SI-NEXT: v_readlane_b32 s67, v61, 15 +; SI-NEXT: v_readlane_b32 s65, v61, 17 +; SI-NEXT: v_readlane_b32 s70, v61, 18 +; SI-NEXT: v_readlane_b32 s49, v61, 20 +; SI-NEXT: v_readlane_b32 s71, v61, 19 +; SI-NEXT: v_readlane_b32 s80, v61, 22 +; SI-NEXT: v_readlane_b32 s36, v61, 21 +; SI-NEXT: v_readlane_b32 s51, v61, 24 +; SI-NEXT: v_readlane_b32 s83, v61, 23 ; SI-NEXT: v_readlane_b32 s84, v61, 25 -; SI-NEXT: v_readlane_b32 s82, v61, 24 -; SI-NEXT: v_readlane_b32 s87, v61, 26 +; SI-NEXT: v_readlane_b32 s82, v61, 26 +; SI-NEXT: v_readlane_b32 s87, v61, 28 ; SI-NEXT: v_readlane_b32 s86, v61, 27 -; SI-NEXT: v_readlane_b32 s96, v61, 29 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: s_mov_b32 s55, s93 -; SI-NEXT: s_mov_b32 s95, s91 -; SI-NEXT: v_readlane_b32 s94, v61, 31 -; SI-NEXT: s_mov_b32 s31, s90 -; SI-NEXT: v_readlane_b32 s34, v61, 30 -; SI-NEXT: v_readlane_b32 s53, v61, 32 +; SI-NEXT: s_mov_b32 s38, s6 +; SI-NEXT: v_readlane_b32 s98, v61, 30 +; SI-NEXT: s_mov_b32 s55, s91 +; SI-NEXT: s_mov_b32 s99, s96 +; SI-NEXT: v_readlane_b32 s30, v61, 32 +; SI-NEXT: s_mov_b32 s90, s93 +; SI-NEXT: v_readlane_b32 s34, v61, 29 +; SI-NEXT: v_readlane_b32 s53, v61, 31 ; SI-NEXT: v_readlane_b32 s66, v61, 33 ; SI-NEXT: v_readlane_b32 s68, v61, 34 ; SI-NEXT: v_readlane_b32 s69, v61, 35 -; SI-NEXT: v_readlane_b32 s8, v61, 36 +; SI-NEXT: v_readlane_b32 s6, v61, 36 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr1 @@ -182598,8 +182070,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -182624,19 +182096,19 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill @@ -182666,10 +182138,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v44, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v29 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -182678,46 +182150,42 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v41, 8, v26 +; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v30 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v31 +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 @@ -182726,55 +182194,40 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v36 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -182785,805 +182238,824 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:36 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB93_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff ; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s8, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v44, v8 +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v10 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v38, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v39, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_or_b32_sdwa v1, v63, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_or_b32_sdwa v0, v39, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v42, v43 -; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v47, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_or_b32_sdwa v0, v41, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v52, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v35, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v61, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v36, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v34, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v28, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v25, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v46, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v28, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v57, v38 +; VI-NEXT: v_or_b32_sdwa v0, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v40, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v59, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v51, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v41 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v46, v1 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v56, v1 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v47, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v39 -; VI-NEXT: v_mov_b32_e32 v54, v33 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v62 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v52, v60 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v54, v63 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_branch .LBB93_3 ; VI-NEXT: .LBB93_2: -; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v7 -; VI-NEXT: v_mov_b32_e32 v57, v5 -; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: v_mov_b32_e32 v57, v38 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB93_3: ; %Flow -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB93_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s5, s27, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s9, s19, 8 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v29, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 -; VI-NEXT: v_or_b32_sdwa v30, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 -; VI-NEXT: v_or_b32_sdwa v28, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v44, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v27, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v63 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v40, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v26, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v34, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 -; VI-NEXT: v_or_b32_sdwa v26, v26, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v28, v53, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v21, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v21 -; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v14, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v24, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v27, v49, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v32, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v15, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v26, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 +; VI-NEXT: v_or_b32_sdwa v37, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: v_or_b32_sdwa v26, v26, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v38, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v24, v36, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v46 +; VI-NEXT: v_or_b32_sdwa v39, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v15 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v14 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 +; VI-NEXT: v_or_b32_sdwa v24, v24, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v25, v25, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v28, v28, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v61, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v61 -; VI-NEXT: v_or_b32_sdwa v23, v23, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v23, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v13 +; VI-NEXT: v_or_b32_sdwa v29, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v48, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v23, v23, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v36, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 -; VI-NEXT: v_or_b32_sdwa v22, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v22, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v49, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v22, v22, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v21, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v63, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v38, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 -; VI-NEXT: v_or_b32_sdwa v21, v63, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v50, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v20, v45, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 +; VI-NEXT: v_or_b32_sdwa v51, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v19, v43, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v52, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v19, v19, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v20, v20, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v21, v21, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: v_or_b32_sdwa v39, v45, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v19, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v48, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 -; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v62, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v62 -; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v53, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x300, v53 +; VI-NEXT: v_or_b32_sdwa v18, v18, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v51 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v50 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v49, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 -; VI-NEXT: v_or_b32_sdwa v15, v15, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_or_b32_sdwa v54, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v17, v17, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v17 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 -; VI-NEXT: v_or_b32_sdwa v14, v14, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_sdwa v29, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: v_or_b32_sdwa v55, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x300, v55 +; VI-NEXT: v_or_b32_sdwa v16, v16, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v16 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v13, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v52, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v52 -; VI-NEXT: v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v44 -; VI-NEXT: v_or_b32_sdwa v28, v28, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v40, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v41, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v1 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v42, v58, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v42 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v54, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v54, vcc, 0x300, v54 -; VI-NEXT: v_or_b32_sdwa v12, v12, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v12, v47, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v43, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v40, vcc, 0x300, v43 +; VI-NEXT: v_or_b32_sdwa v12, v12, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v50, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v11, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v44, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v11, v11, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v53, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v10, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v45, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v10, v10, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v9, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v10 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 -; VI-NEXT: v_or_b32_sdwa v49, v16, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x300, v40 -; VI-NEXT: v_or_b32_sdwa v27, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v46, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v9, v9, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v8, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v43, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v43 -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v41 -; VI-NEXT: v_or_b32_sdwa v17, v17, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v11, v50, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v49 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v0 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v30, v30, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v47, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v47 +; VI-NEXT: v_or_b32_sdwa v8, v8, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v7, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v56, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v56 ; VI-NEXT: v_or_b32_sdwa v7, v7, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v6, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v57, v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v57 ; VI-NEXT: v_or_b32_sdwa v6, v6, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v5, v32, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v47, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v58, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v47, vcc, 0x300, v47 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v47, vcc, 0x300, v58 ; VI-NEXT: v_or_b32_sdwa v5, v5, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v30, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 -; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v56, vcc, 0x300, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v4, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v56, vcc, 3, v56 -; VI-NEXT: v_or_b32_sdwa v56, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v56, s4, v56 -; VI-NEXT: s_and_b32 s4, s26, 0xff -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_and_b32 s5, s24, 0xff -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_or_b32 s7, s8, s7 -; VI-NEXT: s_and_b32 s8, s18, 0xff -; VI-NEXT: s_or_b32 s8, s9, s8 -; VI-NEXT: s_and_b32 s9, s16, 0xff -; VI-NEXT: s_or_b32 s9, s10, s9 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s8, s8, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v56 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v57, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 ; VI-NEXT: .LBB93_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -183623,35 +183095,31 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 @@ -183662,268 +183130,284 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 ; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v56 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v52 ; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v51 ; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v55 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v53 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v20 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v26 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v28 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 -; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v24 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:248 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 -; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v37 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:312 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:328 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:36 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:44 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v1 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:68 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(55) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(60) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -183931,722 +183415,725 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v57, v5 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v34, v35 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s4, s5 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v46, v32 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v17, v45, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v45, v59 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v55, v22 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v16, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v47, v32 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_mov_b32_e32 v33, v35 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v51, v57 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v49, v39 -; GFX9-NEXT: v_mov_b32_e32 v59, v44 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v58, v50 -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v54, v63 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v39 +; GFX9-NEXT: v_mov_b32_e32 v39, v54 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v52, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v54, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v29 +; GFX9-NEXT: v_mov_b32_e32 v34, v37 +; GFX9-NEXT: v_mov_b32_e32 v37, v53 +; GFX9-NEXT: v_mov_b32_e32 v53, v36 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v48, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v36, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v62, v61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v57, v35 -; GFX9-NEXT: v_mov_b32_e32 v35, v38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_branch .LBB93_3 ; GFX9-NEXT: .LBB93_2: -; GFX9-NEXT: v_mov_b32_e32 v58, v50 -; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v34, v35 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v49, v39 -; GFX9-NEXT: v_mov_b32_e32 v55, v22 -; GFX9-NEXT: v_mov_b32_e32 v51, v5 +; GFX9-NEXT: v_mov_b32_e32 v58, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v40 +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v46, v32 +; GFX9-NEXT: v_mov_b32_e32 v35, v32 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB93_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB93_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v16, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v16, v45, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v15, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v15, v46, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_addk_i32 s4, 0x300 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s24, s24, 3 -; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 ; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: s_lshl_b32 s6, s27, 8 -; GFX9-NEXT: s_add_i32 s20, s20, 3 -; GFX9-NEXT: s_lshl_b32 s7, s21, 8 -; GFX9-NEXT: s_add_i32 s22, s22, 3 -; GFX9-NEXT: s_lshl_b32 s8, s23, 8 -; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshl_b32 s9, s17, 8 -; GFX9-NEXT: s_add_i32 s18, s18, 3 -; GFX9-NEXT: s_lshl_b32 s10, s19, 8 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 -; GFX9-NEXT: v_or_b32_sdwa v23, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 -; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_and_b32 s4, s24, 0xff -; GFX9-NEXT: s_or_b32 s4, s5, s4 -; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 ; GFX9-NEXT: s_or_b32 s6, s7, s6 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 ; GFX9-NEXT: s_or_b32 s7, s8, s7 -; GFX9-NEXT: s_and_b32 s8, s16, 0xff +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_and_b32 s9, s18, 0xff +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_or_b32 s9, s10, s9 -; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: s_or_b32 s10, s11, s10 ; GFX9-NEXT: s_addk_i32 s5, 0x300 ; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_addk_i32 s7, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_addk_i32 s9, 0x300 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v4, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v24, v59, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v7, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v8, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v28, v48, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v28 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v9, 3, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v37, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v10, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v37, v37, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v11, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v39, v39, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v12, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v48, v52, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v48 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v13, v51, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v14, 3, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v38, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v14, v47, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v17, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v18, 3, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v39, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v18, v43, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v48, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v19, v44, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v20, 3, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v20, v42, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v50, v34, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v51, v33, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v52, v63, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v25 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v14 +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v23 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v18 +; GFX9-NEXT: v_add_u32_e32 v50, 0x300, v50 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v42, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v49, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v43, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v43 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v58 -; GFX9-NEXT: v_or_b32_sdwa v19, v51, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v51, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v44, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v45, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v62 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v46, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v47, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v49, 0x300, v19 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v55 +; GFX9-NEXT: v_add_u32_e32 v55, 0x300, v44 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v46 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v47 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v56, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v57, v57, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v35 +; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v56 +; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v27 +; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v37 +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v39 +; GFX9-NEXT: v_add_u32_e32 v39, 0x300, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v58, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v58 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v59, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v36 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v38 +; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v13 +; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v57 +; GFX9-NEXT: v_add_u32_e32 v59, 0x300, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v34 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v35, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 -; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 +; GFX9-NEXT: v_or_b32_sdwa v60, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v60 -; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 -; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 -; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v46 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v36, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 -; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_or_b32_sdwa v24, v57, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 -; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 -; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 +; GFX9-NEXT: v_or_b32_sdwa v61, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v58, 0x300, v61 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 -; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v62, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v41, 0x300, v62 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v63, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v57, 0x300, v63 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 -; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 -; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 -; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 -; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 -; GFX9-NEXT: v_add_u32_e32 v39, 0x300, v49 -; GFX9-NEXT: v_add_u32_e32 v49, 0x300, v53 -; GFX9-NEXT: v_add_u32_e32 v50, 0x300, v55 -; GFX9-NEXT: v_add_u32_e32 v53, 0x300, v45 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v56, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v5 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v43, 0x300, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v7 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v44, 0x300, v6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v8 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v9 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v45, 0x300, v8 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v7, v45, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v10 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v9 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v11 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v46, 0x300, v10 +; GFX9-NEXT: v_lshl_or_b32 v6, v46, 16, v6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v12 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v22 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v51 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v52 +; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v54 +; GFX9-NEXT: v_add_u32_e32 v54, 0x300, v42 +; GFX9-NEXT: v_add_u32_e32 v42, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v5 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v8, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; GFX9-NEXT: v_lshl_or_b32 v9, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; GFX9-NEXT: v_lshl_or_b32 v10, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v20 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v53 +; GFX9-NEXT: v_add_u32_e32 v53, 0x300, v40 +; GFX9-NEXT: v_add_u32_e32 v40, 0x300, v60 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; GFX9-NEXT: v_add_u32_e32 v47, 0x300, v12 +; GFX9-NEXT: v_lshl_or_b32 v12, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v5, v47, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_lshl_or_b32 v16, v55, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v54, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 -; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 ; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 ; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 ; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 ; GFX9-NEXT: .LBB93_5: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -184877,14 +184364,9 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB93_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_and_b32 v1, 0xff, v35 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 @@ -184894,186 +184376,170 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s29, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v66 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v65 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v118 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v67 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v68 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v69 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v1, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v80 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v81 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v81 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v83 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v3, v86 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v1.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v96 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v3, v86 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v54 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v98 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v87 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v97 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v102 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v101 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v1, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v114 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v113 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v116 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v133 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v134 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v161 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v129 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v147 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v166 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v3, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v148 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v149 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v151 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v2, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v177 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v180 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v149 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v42 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v180 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v115 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v1, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v44 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v45 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v44 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v3, v45 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v145 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v59 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v56 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v60 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v3, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v160 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v62 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v2, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v72 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v73 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v160 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v75 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v1, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v176 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v179 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v75 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v76 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v77 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v3, v77 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v43 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v78 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v79 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v89 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v40 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v91 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v3, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v47 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v58 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v93 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v3, v91 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v92 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v57 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v93 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v0.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v1, v92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v2.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB93_3 ; GFX11-TRUE16-NEXT: .LBB93_2: ; %cmp.true @@ -185649,233 +185115,211 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB93_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 ; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-FAKE16-NEXT: s_or_b32 s11, s11, s12 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s26, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s11 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v66 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v64 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v49 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v6, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v6, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v85 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v9, v83 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v100 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v96 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v11, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v99 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v12, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v13, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v12, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v14, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v15, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v15, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v16, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v17, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v165 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v42 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v18, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v19, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v20, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v131 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v145 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v59 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v21, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v22, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v23, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v24, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v163 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v176 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v24, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v25, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v26, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v27, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v183 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v43 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v78 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v89 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v27, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v28, v88 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v58 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v29, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v30, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v46 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v92 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v90 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB93_3 ; GFX11-FAKE16-NEXT: .LBB93_2: ; %cmp.true @@ -186320,101 +185764,100 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:92 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v10, v20 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v35 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v39 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v57, v45 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v60 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v60, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr51 @@ -186422,56 +185865,57 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 @@ -186494,24 +185938,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v46, v14 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; kill: killed $vgpr14 @@ -186650,10 +186094,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; kill: killed $vgpr14 @@ -186663,13 +186105,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; kill: killed $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v46, v5 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; kill: killed $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: v_mov_b32_e32 v45, v46 -; SI-NEXT: v_mov_b32_e32 v46, v6 -; SI-NEXT: v_mov_b32_e32 v6, v5 +; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr14 @@ -186677,193 +186120,156 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v16, v41, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_or_b32_e32 v17, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v45, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v43, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v54, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v53, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v51, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v52, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v49, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v50, v5, v14 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: v_or_b32_e32 v39, v5, v14 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v48, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v37, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v38, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v5, v43, v45, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v35, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v36, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v5, v43, v45, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v30, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v10 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v34, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v53, v54, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v5, v43, v45, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v28, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v29, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v53, v54, 16 +; SI-NEXT: v_alignbit_b32 v5, v53, v54, 24 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v26, v5, v14 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_or_b32_e32 v26, v14, v5 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v53, v54, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v24, v47, v14 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v25, v5, v14 -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v27, v14, v5 +; SI-NEXT: v_alignbit_b32 v5, v53, v54, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 -; SI-NEXT: v_or_b32_e32 v22, v58, v5 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56 -; SI-NEXT: v_or_b32_e32 v23, v57, v5 -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 -; SI-NEXT: v_or_b32_e32 v20, v61, v5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_or_b32_e32 v21, v60, v5 -; SI-NEXT: v_alignbit_b32 v5, v52, v51, 8 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 -; SI-NEXT: v_or_b32_e32 v18, v40, v5 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 -; SI-NEXT: v_or_b32_e32 v19, v55, v5 -; SI-NEXT: v_alignbit_b32 v5, v50, v49, 24 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 -; SI-NEXT: v_or_b32_e32 v16, v1, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_or_b32_e32 v17, v42, v1 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v24, v47, v5 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v42, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_or_b32_e32 v15, v3, v1 ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v14, v4, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_or_b32_e32 v15, v2, v1 ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: v_alignbit_b32 v1, v48, v39, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v48, v39, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 +; SI-NEXT: v_alignbit_b32 v1, v48, v39, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 @@ -186873,31 +186279,28 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v34, v30, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v34, v30, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v34, v30, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v29, v28, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v29, v28, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill @@ -186906,13 +186309,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v27, v26, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v27, v26, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56 +; SI-NEXT: v_or_b32_e32 v25, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v5, v53, v54, 8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 +; SI-NEXT: v_or_b32_e32 v22, v58, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v23, v57, v5 +; SI-NEXT: v_alignbit_b32 v5, v52, v51, 24 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v20, v61, v5 ; SI-NEXT: v_alignbit_b32 v1, v25, v24, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v25, v24, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill @@ -186921,63 +186341,81 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v23, v22, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v23, v22, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_or_b32_e32 v21, v60, v5 +; SI-NEXT: v_alignbit_b32 v5, v52, v51, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v18, v40, v5 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v1, v21, v20, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v20, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v20, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_or_b32_e32 v19, v55, v5 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v19, v18, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v19, v18, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v48 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 @@ -186987,46 +186425,46 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill @@ -187034,32 +186472,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v56, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v47, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v46, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v44, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -187117,60 +186555,55 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_alignbit_b32 v5, v52, v51, 8 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v5, v29, v28, 16 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: .LBB94_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: s_xor_b64 exec, exec, s[4:5] +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v14, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_or_b32_e32 v15, v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v14, v14, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v44 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v16, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 +; SI-NEXT: v_or_b32_e32 v15, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v16, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v46 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 ; SI-NEXT: v_or_b32_e32 v17, v1, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 @@ -187185,7 +186618,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 @@ -187203,6 +186641,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v20, v2, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -187219,52 +186661,43 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v22, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v23, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v23, v1, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_or_b32_e32 v24, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v24, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_or_b32_e32 v25, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v26, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -187272,13 +186705,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v28, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v27, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 @@ -187294,29 +186727,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: v_or_b32_e32 v29, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_alignbit_b32 v5, v29, v28, 16 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 ; SI-NEXT: v_or_b32_e32 v30, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v34, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -187329,7 +186763,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v35, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -187337,13 +186771,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v37, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v36, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 @@ -187359,29 +186793,29 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 ; SI-NEXT: v_or_b32_e32 v38, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: v_or_b32_e32 v48, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: v_or_b32_e32 v39, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v39, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -187394,7 +186828,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v49, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -187402,13 +186836,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v51, v4, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v50, v3, v1 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 @@ -187424,13 +186858,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 ; SI-NEXT: v_or_b32_e32 v52, v1, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 ; SI-NEXT: v_or_b32_e32 v54, v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 @@ -187438,9 +186872,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v53, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 @@ -187457,49 +186891,52 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v44, v2, v1 +; SI-NEXT: v_or_b32_e32 v45, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; SI-NEXT: v_or_b32_e32 v41, v3, v1 -; SI-NEXT: v_alignbit_b32 v1, v41, v44, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v43, v3, v1 +; SI-NEXT: v_alignbit_b32 v1, v43, v45, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v41, v44, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v43, v45, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v43, v45, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v52, v51, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v52, v51, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v52, v51, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v50, v49, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 24 +; SI-NEXT: v_alignbit_b32 v1, v48, v39, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v48, v39, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v39, v48, 8 +; SI-NEXT: v_alignbit_b32 v1, v48, v39, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 24 @@ -187509,31 +186946,28 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v38, v37, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v36, v35, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v34, v30, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v34, v30, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v34, v30, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v29, v28, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v29, v28, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill @@ -187542,13 +186976,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v27, v26, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v27, v26, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v25, v24, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v25, v24, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill @@ -187557,63 +186991,63 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v23, v22, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v23, v22, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v20, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v20, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v21, v20, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v19, v18, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v19, v18, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v17, v16, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v17, v16, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v50 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v48 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 @@ -187623,46 +187057,46 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v13, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill @@ -187670,58 +187104,59 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v1, v11, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v7, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v6, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v1, v42, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v41, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v55, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v1, v40, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v46, 8, 8 -; SI-NEXT: v_alignbit_b32 v5, v41, v44, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v45, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v44, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: .LBB94_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v45 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -187734,14 +187169,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -187752,7 +187187,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 @@ -187768,14 +187203,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v51 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -187786,8 +187221,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -187802,14 +187237,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -187821,7 +187256,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -187838,12 +187273,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -187855,13 +187290,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -187870,7 +187305,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload @@ -187907,11 +187342,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -187938,14 +187373,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -187956,14 +187391,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -187973,24 +187408,22 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 @@ -188006,14 +187439,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -188024,8 +187457,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -188043,7 +187476,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -188058,30 +187491,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -188092,7 +187527,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload @@ -188110,14 +187545,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -188131,7 +187566,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -188146,14 +187581,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -188164,7 +187599,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -188182,14 +187617,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -188201,7 +187636,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -188216,14 +187651,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -188234,15 +187669,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -188272,6 +187707,38 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-LABEL: bitcast_v64f16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v31, v30 +; VI-NEXT: v_mov_b32_e32 v30, v29 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -188288,1078 +187755,1012 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v44, v12 -; VI-NEXT: v_mov_b32_e32 v12, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v20 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, v22 -; VI-NEXT: v_mov_b32_e32 v54, v21 -; VI-NEXT: v_mov_b32_e32 v31, v19 -; VI-NEXT: v_mov_b32_e32 v43, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v44 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v29 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: ; kill: killed $vgpr0 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v49 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; kill: killed $vgpr25 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v56, v38 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v7 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v15, v3 -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v28, v48 -; VI-NEXT: v_mov_b32_e32 v48, v16 -; VI-NEXT: v_mov_b32_e32 v16, v40 -; VI-NEXT: v_mov_b32_e32 v47, v39 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v63, v53 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v32 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v44 -; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v32 -; VI-NEXT: v_lshrrev_b32_e32 v13, 24, v18 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v38 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v37 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v44 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v43 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v62, v36 -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[37:38] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[43:44] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[10:11] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 -; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 -; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 -; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[52:53] -; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[58:59] -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 -; VI-NEXT: v_mov_b32_e32 v53, v63 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 -; VI-NEXT: v_mov_b32_e32 v7, v45 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; VI-NEXT: v_mov_b32_e32 v45, v60 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] -; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[49:50] -; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[39:40] -; VI-NEXT: v_mov_b32_e32 v58, v51 -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] -; VI-NEXT: v_mov_b32_e32 v36, v62 -; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[54:55] -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v34, v14 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v40, v16 -; VI-NEXT: v_mov_b32_e32 v16, v48 -; VI-NEXT: v_mov_b32_e32 v48, v28 -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v16 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v15 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 24, v14 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v14 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v13 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 24, v12 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v12 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v11 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 24, v10 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v10 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v9 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v7 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v5 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v4 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 -; VI-NEXT: v_mov_b32_e32 v3, v15 -; VI-NEXT: v_mov_b32_e32 v15, v29 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v38, v56 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v39 -; VI-NEXT: v_mov_b32_e32 v29, v41 -; VI-NEXT: v_mov_b32_e32 v39, v47 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v54 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v2 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 24, v49 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v49 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v48 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 24, v31 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v31 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v30 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v28 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v27 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v63, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v38, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v20 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v18 +; VI-NEXT: v_mov_b32_e32 v29, v57 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[48:49] +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v26 +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[30:31] +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[17:18] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v33, v32 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v17 ; VI-NEXT: .LBB94_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: v_mov_b32_e32 v32, v45 +; VI-NEXT: s_xor_b64 exec, exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v63, 0x200 -; VI-NEXT: v_add_f16_sdwa v21, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 +; VI-NEXT: v_mov_b32_e32 v54, 0x200 +; VI-NEXT: v_add_f16_sdwa v25, v18, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v25 ; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 -; VI-NEXT: v_add_f16_sdwa v20, v17, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v58, v18, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; VI-NEXT: v_or_b32_e32 v57, v18, v29 +; VI-NEXT: v_add_f16_sdwa v29, v17, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 -; VI-NEXT: v_add_f16_sdwa v22, v32, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v57, v17, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 -; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 -; VI-NEXT: v_or_b32_e32 v15, v32, v0 -; VI-NEXT: v_add_f16_sdwa v0, v31, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: v_add_f16_sdwa v45, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 -; VI-NEXT: v_add_f16_sdwa v23, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_e32 v14, v31, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_e32 v62, v55, v0 -; VI-NEXT: v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 -; VI-NEXT: v_or_b32_e32 v61, v54, v0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v26, v54 -; VI-NEXT: v_mov_b32_e32 v27, v55 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 -; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: v_or_b32_e32 v34, v25, v0 -; VI-NEXT: v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v33, v24, v0 -; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_or_b32_e32 v36, v2, v0 -; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v18, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v35, v1, v0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v55, v12, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v56, v17, v29 +; VI-NEXT: v_add_f16_sdwa v17, v20, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v59, v20, v29 +; VI-NEXT: v_add_f16_sdwa v29, v19, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_add_f16_sdwa v17, v22, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_add_f16_sdwa v20, v9, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v58, v19, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_add_f16_sdwa v17, v21, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v47, v22, v29 +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_add_f16_sdwa v17, v24, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_add_f16_sdwa v19, v4, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v22, v11, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v46, v21, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_add_f16_sdwa v17, v23, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v61, v24, v29 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v17 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_f16_sdwa v17, v26, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v21, v6, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v24, v13, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v60, v23, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_or_b32_e32 v63, v26, v29 +; VI-NEXT: v_add_f16_sdwa v29, v25, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_add_f16_sdwa v17, v28, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v26, v2, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_or_b32_e32 v38, v2, v0 -; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v37, v1, v0 -; VI-NEXT: v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v23, v8, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v62, v25, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_add_f16_sdwa v25, v27, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v34, v28, v29 +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v25 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v25, v30, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_add_f16_sdwa v17, v49, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v49, 0x200, v49 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v33, v27, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v45 +; VI-NEXT: v_or_b32_e32 v36, v31, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v25 +; VI-NEXT: v_or_b32_e32 v35, v30, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_add_f16_sdwa v27, v48, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v48, 0x200, v48 +; VI-NEXT: v_or_b32_e32 v40, v49, v29 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v27 +; VI-NEXT: v_add_f16_sdwa v17, v10, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_mov_b32_e32 v27, v30 +; VI-NEXT: v_add_f16_sdwa v25, v16, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v28, v31 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v39, v48, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v26 +; VI-NEXT: v_or_b32_e32 v38, v2, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_or_b32_e32 v37, v1, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_add_f16_sdwa v18, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v44, v4, v29 ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v49, v9, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; VI-NEXT: v_add_f16_sdwa v1, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v48, v8, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v31 -; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v10, v32 -; VI-NEXT: v_add_f16_e32 v43, 0x200, v43 -; VI-NEXT: v_or_b32_e32 v51, v3, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; VI-NEXT: v_or_b32_e32 v50, v2, v0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_f16_sdwa v3, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_or_b32_e32 v53, v2, v0 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v44, 0x200, v44 -; VI-NEXT: v_or_b32_e32 v52, v1, v0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_f16_sdwa v59, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_or_b32_e32 v46, v2, v0 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; VI-NEXT: v_or_b32_e32 v45, v1, v0 -; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; VI-NEXT: v_or_b32_e32 v5, v7, v0 -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v4, v6, v0 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v18 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v43, v3, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v21 +; VI-NEXT: v_add_f16_sdwa v18, v5, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v42, v6, v29 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v18 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v41, v5, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; VI-NEXT: v_add_f16_sdwa v18, v7, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v49, v8, v29 ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 -; VI-NEXT: v_or_b32_e32 v41, v7, v0 -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; VI-NEXT: v_or_b32_e32 v40, v6, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 -; VI-NEXT: v_or_b32_e32 v7, v25, v0 -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v46 -; VI-NEXT: v_or_b32_e32 v6, v24, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; VI-NEXT: v_or_b32_e32 v32, v44, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; VI-NEXT: v_or_b32_e32 v31, v43, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 -; VI-NEXT: v_or_b32_e32 v30, v2, v0 -; VI-NEXT: v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; VI-NEXT: v_or_b32_e32 v29, v1, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v34 -; VI-NEXT: v_or_b32_e32 v1, v55, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; VI-NEXT: v_or_b32_e32 v0, v54, v0 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[29:30] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[31:32] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_mov_b32_e32 v32, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; VI-NEXT: v_mov_b32_e32 v31, v9 -; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] -; VI-NEXT: v_mov_b32_e32 v7, v11 -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[40:41] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, v27 -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v54, v26 -; VI-NEXT: v_mov_b32_e32 v26, v20 -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 -; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[4:5] -; VI-NEXT: v_mov_b32_e32 v5, v22 -; VI-NEXT: v_mov_b32_e32 v13, v21 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[45:46] -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[50:51] -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v48 -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[48:49] -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v36 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 -; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36] -; VI-NEXT: v_mov_b32_e32 v36, v2 -; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v15 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v14 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 -; VI-NEXT: v_mov_b32_e32 v48, v56 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v33 -; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[33:34] -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[14:15] -; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v58 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 -; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57 -; VI-NEXT: v_mov_b32_e32 v9, v23 -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40 -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v14, v8 -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v40, v42 -; VI-NEXT: v_bfe_u32 v8, v42, 8, 8 -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38 -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37 -; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38] -; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v45 -; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53] -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v51 -; VI-NEXT: v_mov_b32_e32 v38, v28 -; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[57:58] -; VI-NEXT: v_bfe_u32 v28, v36, 8, 8 -; VI-NEXT: v_bfe_u32 v29, v38, 8, 8 -; VI-NEXT: v_mov_b32_e32 v53, v3 -; VI-NEXT: v_bfe_u32 v15, v3, 8, 8 -; VI-NEXT: v_mov_b32_e32 v3, v59 -; VI-NEXT: v_bfe_u32 v51, v48, 8, 8 -; VI-NEXT: v_bfe_u32 v57, v7, 8, 8 -; VI-NEXT: v_bfe_u32 v58, v60, 8, 8 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v9, v9, 8, 8 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v5, v5, 8, 8 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v13, v13, 8, 8 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v18 +; VI-NEXT: v_or_b32_e32 v48, v7, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v17 +; VI-NEXT: v_or_b32_e32 v51, v10, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v20 +; VI-NEXT: v_or_b32_e32 v50, v9, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v55 +; VI-NEXT: v_or_b32_e32 v53, v12, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_add_f16_sdwa v20, v14, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v52, v11, v29 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v20 +; VI-NEXT: v_or_b32_e32 v32, v14, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v24 +; VI-NEXT: v_or_b32_e32 v31, v13, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v25 +; VI-NEXT: v_or_b32_e32 v30, v16, v29 +; VI-NEXT: v_add_f16_sdwa v29, v15, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_or_b32_e32 v29, v15, v29 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[29:30] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v30 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v29 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[31:32] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v32 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v31 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[52:53] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v53 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v52 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[50:51] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v51 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v50 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[48:49] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v49 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v48 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[41:42] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v42 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v41 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[43:44] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v44 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v43 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[37:38] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v38 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[39:40] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v40 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v39 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[35:36] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v36 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v35 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[33:34] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v33 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[62:63] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v63 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v62 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[60:61] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v61 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[46:47] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v47 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[58:59] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[56:57] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v20 +; VI-NEXT: v_bfe_u32 v18, v25, 8, 8 +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v18, v32, 8, 8 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v17, v55, 8, 8 +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v31, v28 +; VI-NEXT: v_mov_b32_e32 v30, v27 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v37 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v59 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v56 +; VI-NEXT: v_bfe_u32 v56, v23, 8, 8 +; VI-NEXT: v_bfe_u32 v59, v21, 8, 8 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v62, v19 +; VI-NEXT: v_bfe_u32 v41, v19, 8, 8 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v60 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v46 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v58 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v57 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_bfe_u32 v46, v26, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_bfe_u32 v63, v52, 8, 8 +; VI-NEXT: v_bfe_u32 v39, v51, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_bfe_u32 v17, v55, 8, 8 ; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_bfe_u32 v42, v0, 8, 8 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfe_u32 v47, v0, 8, 8 +; VI-NEXT: v_bfe_u32 v25, v40, 8, 8 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v25, v45, 8, 8 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_bfe_u32 v58, v50, 8, 8 +; VI-NEXT: v_bfe_u32 v38, v53, 8, 8 +; VI-NEXT: v_bfe_u32 v44, v47, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_bfe_u32 v33, v25, 8, 8 ; VI-NEXT: .LBB94_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v52 -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, v12, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v1, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v29, v25, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46 +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41 +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v8 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v15 -; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50 -; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v13 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v33 -; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x48, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x4c, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v41 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v9 -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v33 +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v63 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v21 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v30 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v63 +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x70, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58 +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -189385,65 +188786,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -189461,71 +188803,43 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -189537,6 +188851,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -189547,6 +188865,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -189557,6 +188879,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -189567,6 +188893,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -189577,196 +188907,295 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: s_waitcnt vmcnt(62) +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v3 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(62) +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 -; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[34:35], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v17 ; GFX9-NEXT: .LBB94_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB94_4 @@ -189777,507 +189206,521 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] ; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(60) +; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] ; GFX9-NEXT: v_pk_add_f16 v28, v28, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v27, v27, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] ; GFX9-NEXT: v_pk_add_f16 v26, v26, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v25, v25, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] ; GFX9-NEXT: v_pk_add_f16 v24, v24, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v23, v23, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] ; GFX9-NEXT: v_pk_add_f16 v22, v22, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v21, v21, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v3 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v17 ; GFX9-NEXT: .LBB94_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v63 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49 -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42 -; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v34 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v34, v34, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v61, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 @@ -190285,11 +189728,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -190298,10 +189741,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -190311,11 +189754,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -190324,12 +189767,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -190608,17 +190054,17 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v167.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v166.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v166.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v165.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v160.l @@ -190833,101 +190279,101 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 @@ -190936,375 +190382,375 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-FAKE16-NEXT: .LBB94_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB94_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-FAKE16-NEXT: .LBB94_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v74 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v68 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v60 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v47 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v68 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v63 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v54 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v62 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v68, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v43 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v69, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v67 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v68, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v64, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v177 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v68, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v165 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v151 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v145 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v148 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v144 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v145 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v66, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v134 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v132 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v116 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v64, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v54, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v66, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v68, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64 ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v64 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v101 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v97 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v98 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v97 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v87 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v83 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v70 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v68 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v73 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v53 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 @@ -191326,30 +190772,30 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v59 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v46 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v50 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v45 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v41 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v41 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v182 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v180 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v181 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v176 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v167 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 @@ -191371,29 +190817,29 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v163 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v160 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v147 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v144 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v132 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v131 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v119 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 @@ -191416,31 +190862,31 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v112 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v113 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v102 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v100 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v86 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v80 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v70 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 @@ -191513,8 +190959,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill @@ -191538,16 +190984,17 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 @@ -191560,475 +191007,448 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v63, s34, 2 ; SI-NEXT: v_writelane_b32 v63, s35, 3 ; SI-NEXT: v_writelane_b32 v63, s36, 4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_writelane_b32 v63, s37, 5 ; SI-NEXT: v_writelane_b32 v63, s38, 6 ; SI-NEXT: v_writelane_b32 v63, s39, 7 ; SI-NEXT: v_writelane_b32 v63, s48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 ; SI-NEXT: v_writelane_b32 v63, s49, 9 ; SI-NEXT: v_writelane_b32 v63, s50, 10 ; SI-NEXT: v_writelane_b32 v63, s51, 11 ; SI-NEXT: v_writelane_b32 v63, s52, 12 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; SI-NEXT: v_writelane_b32 v63, s53, 13 ; SI-NEXT: v_writelane_b32 v63, s54, 14 ; SI-NEXT: v_writelane_b32 v63, s55, 15 ; SI-NEXT: v_writelane_b32 v63, s64, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 ; SI-NEXT: v_writelane_b32 v63, s65, 17 ; SI-NEXT: v_writelane_b32 v63, s66, 18 ; SI-NEXT: v_writelane_b32 v63, s67, 19 ; SI-NEXT: v_writelane_b32 v63, s68, 20 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 ; SI-NEXT: v_writelane_b32 v63, s69, 21 ; SI-NEXT: v_writelane_b32 v63, s70, 22 ; SI-NEXT: v_writelane_b32 v63, s71, 23 ; SI-NEXT: v_writelane_b32 v63, s80, 24 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 ; SI-NEXT: v_writelane_b32 v63, s81, 25 ; SI-NEXT: v_writelane_b32 v63, s82, 26 ; SI-NEXT: v_writelane_b32 v63, s83, 27 ; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 ; SI-NEXT: v_writelane_b32 v63, s85, 29 ; SI-NEXT: v_writelane_b32 v63, s86, 30 ; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_mov_b32_e32 v46, v29 ; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v36 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v28, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v38 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v60 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v51 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v52 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v40 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v43, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 ; SI-NEXT: v_writelane_b32 v63, s97, 33 ; SI-NEXT: v_writelane_b32 v63, s98, 34 ; SI-NEXT: v_writelane_b32 v63, s99, 35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB95_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readfirstlane_b32 s4, v19 +; SI-NEXT: v_readfirstlane_b32 s4, v23 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: v_readfirstlane_b32 s5, v21 ; SI-NEXT: s_or_b32 s44, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: v_readfirstlane_b32 s4, v13 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: v_readfirstlane_b32 s5, v16 ; SI-NEXT: s_or_b32 s45, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 4 -; SI-NEXT: v_writelane_b32 v62, s5, 5 -; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 16 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 0 -; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v17 ; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v37 +; SI-NEXT: v_readfirstlane_b32 s5, v39 ; SI-NEXT: s_or_b32 s42, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_readfirstlane_b32 s4, v50 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v22 +; SI-NEXT: v_readfirstlane_b32 s5, v38 ; SI-NEXT: s_or_b32 s43, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 10 -; SI-NEXT: v_writelane_b32 v62, s5, 11 -; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 ; SI-NEXT: v_writelane_b32 v62, s4, 8 ; SI-NEXT: v_writelane_b32 v62, s5, 9 -; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: v_writelane_b32 v62, s5, 7 -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 4 +; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: v_writelane_b32 v62, s5, 5 ; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v19 ; SI-NEXT: s_or_b32 s40, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: v_readfirstlane_b32 s4, v54 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v39 +; SI-NEXT: v_readfirstlane_b32 s5, v18 ; SI-NEXT: s_or_b32 s41, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 16 -; SI-NEXT: v_writelane_b32 v62, s5, 17 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 ; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: v_writelane_b32 v62, s5, 15 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 ; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v62, s5, 13 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_writelane_b32 v62, s5, 11 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v20 +; SI-NEXT: v_readfirstlane_b32 s5, v22 ; SI-NEXT: s_or_b32 s28, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v47 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v39, v5 -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: v_readfirstlane_b32 s46, v55 -; SI-NEXT: v_mov_b32_e32 v17, v43 -; SI-NEXT: v_mov_b32_e32 v40, v34 -; SI-NEXT: v_mov_b32_e32 v41, v21 -; SI-NEXT: v_mov_b32_e32 v51, v42 -; SI-NEXT: s_lshr_b32 s71, s45, 8 -; SI-NEXT: s_lshr_b32 s70, s43, 8 -; SI-NEXT: s_lshr_b32 s69, s41, 8 -; SI-NEXT: v_bfe_u32 v38, v47, 8, 8 -; SI-NEXT: v_bfe_u32 v37, v33, 8, 8 -; SI-NEXT: v_bfe_u32 v35, v32, 8, 8 -; SI-NEXT: v_bfe_u32 v20, v10, 8, 8 -; SI-NEXT: v_bfe_u32 v19, v9, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s5, v46 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s4, v42 +; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_or_b32 s29, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 22 -; SI-NEXT: v_writelane_b32 v62, s5, 23 -; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 ; SI-NEXT: v_writelane_b32 v62, s4, 20 ; SI-NEXT: v_writelane_b32 v62, s5, 21 -; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_readfirstlane_b32 s5, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v45 +; SI-NEXT: v_writelane_b32 v62, s5, 17 ; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_mov_b32_e32 v16, v45 +; SI-NEXT: v_mov_b32_e32 v45, v10 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: s_lshr_b64 s[48:49], s[44:45], 16 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: v_bfe_u32 v23, v13, 8, 8 +; SI-NEXT: v_bfe_u32 v22, v42, 8, 8 +; SI-NEXT: v_bfe_u32 v48, v33, 8, 8 +; SI-NEXT: v_bfe_u32 v21, v32, 8, 8 +; SI-NEXT: v_bfe_u32 v39, v31, 8, 8 +; SI-NEXT: v_bfe_u32 v20, v11, 8, 8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s5, v18 ; SI-NEXT: s_or_b32 s26, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s5, v17 ; SI-NEXT: s_or_b32 s27, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 28 -; SI-NEXT: v_writelane_b32 v62, s5, 29 -; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 ; SI-NEXT: v_writelane_b32 v62, s4, 26 ; SI-NEXT: v_writelane_b32 v62, s5, 27 -; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 ; SI-NEXT: v_writelane_b32 v62, s4, 24 ; SI-NEXT: v_writelane_b32 v62, s5, 25 -; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 22 +; SI-NEXT: v_readfirstlane_b32 s4, v46 +; SI-NEXT: v_writelane_b32 v62, s5, 23 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshr_b32 s68, s29, 8 -; SI-NEXT: s_lshr_b32 s66, s27, 8 -; SI-NEXT: v_bfe_u32 v43, v31, 8, 8 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s5, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s5, v8 ; SI-NEXT: s_or_b32 s24, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_mov_b32_e32 v17, v9 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_bfe_u32 v15, v5, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_readfirstlane_b32 s5, v17 ; SI-NEXT: s_or_b32 s25, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 34 -; SI-NEXT: v_writelane_b32 v62, s5, 35 -; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 32 -; SI-NEXT: v_writelane_b32 v62, s5, 33 -; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 30 -; SI-NEXT: v_readfirstlane_b32 s4, v7 ; SI-NEXT: v_writelane_b32 v62, s5, 31 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 28 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_writelane_b32 v62, s5, 29 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v29 +; SI-NEXT: v_readfirstlane_b32 s5, v45 ; SI-NEXT: s_or_b32 s22, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v6 +; SI-NEXT: v_readfirstlane_b32 s5, v14 ; SI-NEXT: s_or_b32 s23, s5, s4 ; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 40 -; SI-NEXT: v_writelane_b32 v62, s5, 41 -; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 -; SI-NEXT: v_writelane_b32 v62, s4, 38 -; SI-NEXT: v_writelane_b32 v62, s5, 39 -; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 8 -; SI-NEXT: v_writelane_b32 v62, s4, 36 -; SI-NEXT: v_readfirstlane_b32 s4, v58 -; SI-NEXT: v_writelane_b32 v62, s5, 37 +; SI-NEXT: v_writelane_b32 v62, s4, 32 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: v_readfirstlane_b32 s5, v15 ; SI-NEXT: s_or_b32 s20, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: v_readfirstlane_b32 s4, v11 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: v_readfirstlane_b32 s5, v34 ; SI-NEXT: s_or_b32 s21, s5, s4 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 -; SI-NEXT: v_writelane_b32 v62, s4, 44 -; SI-NEXT: v_writelane_b32 v62, s5, 45 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_writelane_b32 v62, s4, 42 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_writelane_b32 v62, s5, 43 -; SI-NEXT: v_readfirstlane_b32 s5, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v7, v29 -; SI-NEXT: v_mov_b32_e32 v29, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_readfirstlane_b32 s5, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s4, v6 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_or_b32 s18, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s5, v4 +; SI-NEXT: v_mov_b32_e32 v3, v58 +; SI-NEXT: v_mov_b32_e32 v2, v51 +; SI-NEXT: v_mov_b32_e32 v4, v56 +; SI-NEXT: s_lshr_b64 s[50:51], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[52:53], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[64:65], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[54:55], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 8 +; SI-NEXT: v_bfe_u32 v15, v4, 8, 8 +; SI-NEXT: v_mov_b32_e32 v34, v18 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s4, v5 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v61 ; SI-NEXT: s_or_b32 s19, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v21 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v24 ; SI-NEXT: s_or_b32 s16, s5, s4 -; SI-NEXT: v_mov_b32_e32 v1, v53 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v21, v24 -; SI-NEXT: s_lshr_b32 s64, s25, 8 -; SI-NEXT: s_lshr_b32 s54, s23, 8 -; SI-NEXT: s_lshr_b32 s52, s21, 8 -; SI-NEXT: s_lshr_b32 s50, s19, 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 -; SI-NEXT: v_mov_b32_e32 v13, v12 -; SI-NEXT: v_bfe_u32 v24, v12, 8, 8 -; SI-NEXT: v_mov_b32_e32 v12, v48 -; SI-NEXT: v_bfe_u32 v48, v48, 8, 8 -; SI-NEXT: v_bfe_u32 v61, v59, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: v_readfirstlane_b32 s4, v26 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_bfe_u32 v18, v11, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: v_readfirstlane_b32 s5, v25 ; SI-NEXT: s_or_b32 s17, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s4, v6 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v42 +; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[84:85], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[82:83], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[16:17], 8 +; SI-NEXT: v_bfe_u32 v38, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v24, v37, 8, 8 +; SI-NEXT: v_mov_b32_e32 v19, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s5, v7 ; SI-NEXT: s_or_b32 s14, s5, s4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: v_readfirstlane_b32 s4, v28 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v46 +; SI-NEXT: v_readfirstlane_b32 s5, v30 ; SI-NEXT: s_or_b32 s15, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: v_readfirstlane_b32 s4, v29 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v23 +; SI-NEXT: v_readfirstlane_b32 s5, v35 ; SI-NEXT: s_or_b32 s12, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: v_readfirstlane_b32 s4, v27 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v3 +; SI-NEXT: v_readfirstlane_b32 s5, v36 ; SI-NEXT: s_or_b32 s13, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v27 +; SI-NEXT: v_readfirstlane_b32 s4, v1 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v28 +; SI-NEXT: v_readfirstlane_b32 s5, v12 ; SI-NEXT: s_or_b32 s10, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v45 +; SI-NEXT: v_readfirstlane_b32 s5, v49 ; SI-NEXT: s_or_b32 s11, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_readfirstlane_b32 s4, v59 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v57 +; SI-NEXT: v_readfirstlane_b32 s5, v44 ; SI-NEXT: s_or_b32 s8, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v59 +; SI-NEXT: v_readfirstlane_b32 s4, v58 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v56 +; SI-NEXT: v_readfirstlane_b32 s5, v57 ; SI-NEXT: s_or_b32 s9, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v26 +; SI-NEXT: v_readfirstlane_b32 s4, v61 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v25 +; SI-NEXT: v_readfirstlane_b32 s5, v60 ; SI-NEXT: s_or_b32 s6, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v50 +; SI-NEXT: v_readfirstlane_b32 s5, v53 ; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_readfirstlane_b32 s4, v47 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v54 +; SI-NEXT: v_readfirstlane_b32 s5, v55 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s5, v53 +; SI-NEXT: v_readfirstlane_b32 s5, v56 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_or_b32 s5, s46, s5 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 -; SI-NEXT: s_lshr_b32 s48, s17, 8 -; SI-NEXT: s_lshr_b32 s67, s15, 8 -; SI-NEXT: s_lshr_b32 s65, s13, 8 -; SI-NEXT: s_lshr_b32 s55, s11, 8 -; SI-NEXT: s_lshr_b32 s53, s9, 8 -; SI-NEXT: s_lshr_b32 s51, s7, 8 -; SI-NEXT: s_lshr_b32 s49, s5, 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; SI-NEXT: v_readfirstlane_b32 s47, v43 +; SI-NEXT: s_or_b32 s5, s47, s5 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 ; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[86:87], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 -; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: v_mov_b32_e32 v57, v30 -; SI-NEXT: v_bfe_u32 v50, v30, 8, 8 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v33 -; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: v_mov_b32_e32 v32, v31 -; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: v_mov_b32_e32 v54, v9 -; SI-NEXT: v_bfe_u32 v42, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v45, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v47, v52, 8, 8 -; SI-NEXT: v_bfe_u32 v33, v1, 8, 8 -; SI-NEXT: v_mov_b32_e32 v3, v14 -; SI-NEXT: v_mov_b32_e32 v25, v59 -; SI-NEXT: v_mov_b32_e32 v1, v52 -; SI-NEXT: v_mov_b32_e32 v44, v11 +; SI-NEXT: s_lshr_b64 s[78:79], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 16 +; SI-NEXT: v_mov_b32_e32 v57, v54 +; SI-NEXT: v_bfe_u32 v49, v54, 8, 8 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: v_bfe_u32 v3, v3, 8, 8 +; SI-NEXT: s_lshr_b64 s[96:97], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s37, s45, 8 +; SI-NEXT: s_lshr_b32 s35, s43, 8 +; SI-NEXT: s_lshr_b32 s31, s41, 8 +; SI-NEXT: s_lshr_b32 s95, s29, 8 +; SI-NEXT: s_lshr_b32 s93, s27, 8 +; SI-NEXT: s_lshr_b32 s91, s25, 8 +; SI-NEXT: s_lshr_b32 s89, s23, 8 +; SI-NEXT: s_lshr_b32 s79, s21, 8 +; SI-NEXT: s_lshr_b32 s75, s19, 8 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_lshr_b32 s77, s15, 8 +; SI-NEXT: s_lshr_b32 s73, s13, 8 +; SI-NEXT: s_lshr_b32 s61, s11, 8 +; SI-NEXT: s_lshr_b32 s59, s9, 8 +; SI-NEXT: s_lshr_b32 s57, s7, 8 +; SI-NEXT: s_lshr_b32 s47, s5, 8 +; SI-NEXT: v_mov_b32_e32 v53, v13 +; SI-NEXT: v_mov_b32_e32 v13, v50 +; SI-NEXT: v_bfe_u32 v50, v50, 8, 8 +; SI-NEXT: v_mov_b32_e32 v12, v42 +; SI-NEXT: v_mov_b32_e32 v61, v33 +; SI-NEXT: v_mov_b32_e32 v60, v32 +; SI-NEXT: v_mov_b32_e32 v58, v31 +; SI-NEXT: v_bfe_u32 v1, v26, 8, 8 +; SI-NEXT: v_bfe_u32 v47, v28, 8, 8 +; SI-NEXT: v_bfe_u32 v43, v27, 8, 8 +; SI-NEXT: v_bfe_u32 v55, v2, 8, 8 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v59, v8 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: v_mov_b32_e32 v42, v14 +; SI-NEXT: v_mov_b32_e32 v46, v7 +; SI-NEXT: v_mov_b32_e32 v2, v5 ; SI-NEXT: s_branch .LBB95_3 ; SI-NEXT: .LBB95_2: -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v7, v29 -; SI-NEXT: v_mov_b32_e32 v29, v6 -; SI-NEXT: v_mov_b32_e32 v39, v5 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v62, s4, 0 ; SI-NEXT: v_writelane_b32 v62, s5, 1 @@ -192077,868 +191497,817 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v62, s4, 30 ; SI-NEXT: v_writelane_b32 v62, s5, 31 +; SI-NEXT: v_mov_b32_e32 v53, v13 +; SI-NEXT: v_mov_b32_e32 v13, v50 +; SI-NEXT: v_mov_b32_e32 v57, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v52, v12 +; SI-NEXT: v_mov_b32_e32 v12, v42 +; SI-NEXT: v_mov_b32_e32 v61, v33 +; SI-NEXT: v_mov_b32_e32 v60, v32 +; SI-NEXT: v_mov_b32_e32 v58, v31 +; SI-NEXT: v_mov_b32_e32 v40, v7 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: v_mov_b32_e32 v16, v45 +; SI-NEXT: s_mov_b64 vcc, -1 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v62, s4, 32 -; SI-NEXT: v_writelane_b32 v62, s5, 33 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v62, s4, 34 -; SI-NEXT: v_writelane_b32 v62, s5, 35 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v62, s4, 36 -; SI-NEXT: v_writelane_b32 v62, s5, 37 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v62, s4, 38 -; SI-NEXT: v_writelane_b32 v62, s5, 39 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v62, s4, 40 -; SI-NEXT: v_writelane_b32 v62, s5, 41 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v62, s4, 42 -; SI-NEXT: v_writelane_b32 v62, s5, 43 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v62, s4, 44 -; SI-NEXT: v_writelane_b32 v62, s5, 45 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: v_writelane_b32 v62, s80, 46 -; SI-NEXT: v_writelane_b32 v62, s81, 47 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: v_writelane_b32 v62, s80, 48 -; SI-NEXT: v_mov_b32_e32 v51, v42 -; SI-NEXT: v_mov_b32_e32 v41, v21 -; SI-NEXT: v_mov_b32_e32 v21, v24 -; SI-NEXT: v_mov_b32_e32 v40, v34 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v13, v12 -; SI-NEXT: v_mov_b32_e32 v12, v48 -; SI-NEXT: v_mov_b32_e32 v57, v30 -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v33 -; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: v_mov_b32_e32 v32, v31 -; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v54, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v17, v43 -; SI-NEXT: s_mov_b64 vcc, -1 -; SI-NEXT: v_writelane_b32 v62, s81, 49 -; SI-NEXT: v_mov_b32_e32 v25, v59 -; SI-NEXT: v_mov_b32_e32 v1, v52 +; SI-NEXT: v_mov_b32_e32 v17, v9 +; SI-NEXT: v_mov_b32_e32 v44, v8 +; SI-NEXT: v_mov_b32_e32 v42, v14 +; SI-NEXT: v_mov_b32_e32 v45, v10 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: v_writelane_b32 v62, s5, 33 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr59 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: .LBB95_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v14, v17 -; SI-NEXT: v_mov_b32_e32 v17, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v4, v37 +; SI-NEXT: v_mov_b32_e32 v37, v54 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v12 +; SI-NEXT: v_mov_b32_e32 v12, v13 +; SI-NEXT: v_mov_b32_e32 v13, v53 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: v_mov_b32_e32 v5, v27 ; SI-NEXT: s_andn2_b64 vcc, exec, vcc -; SI-NEXT: v_mov_b32_e32 v23, v2 -; SI-NEXT: v_mov_b32_e32 v59, v34 -; SI-NEXT: v_mov_b32_e32 v2, v25 +; SI-NEXT: v_mov_b32_e32 v25, v16 +; SI-NEXT: v_mov_b32_e32 v27, v51 +; SI-NEXT: v_mov_b32_e32 v16, v15 ; SI-NEXT: s_cbranch_vccnz .LBB95_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v29, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v18, v14 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mov_b32_e32 v15, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v3 +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v58 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_bfe_u32 v49, v57, 8, 8 +; SI-NEXT: v_bfe_u32 v39, v58, 8, 8 +; SI-NEXT: v_bfe_u32 v50, v12, 8, 8 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_bfe_u32 v50, v57, 8, 8 -; SI-NEXT: v_bfe_u32 v48, v12, 8, 8 -; SI-NEXT: v_bfe_u32 v43, v32, 8, 8 -; SI-NEXT: v_bfe_u32 v24, v13, 8, 8 -; SI-NEXT: v_bfe_u32 v20, v31, 8, 8 -; SI-NEXT: v_bfe_u32 v19, v54, 8, 8 -; SI-NEXT: v_bfe_u32 v42, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v45, v55, 8, 8 -; SI-NEXT: v_bfe_u32 v61, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v33, v53, 8, 8 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s4, v14 -; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_bfe_u32 v47, v28, 8, 8 +; SI-NEXT: v_bfe_u32 v43, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v24, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v23, v13, 8, 8 +; SI-NEXT: v_bfe_u32 v18, v37, 8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v44 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 -; SI-NEXT: v_readfirstlane_b32 s5, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v14 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s5, v53 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s5, v56 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v16 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 -; SI-NEXT: s_lshr_b32 s49, s5, 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[4:5], 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s6, v14 -; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: s_lshl_b32 s6, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s7, v14 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v27 ; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: v_bfe_u32 v55, v27, 8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[86:87], s[6:7], 8 -; SI-NEXT: s_lshr_b32 s51, s7, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s8, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v8 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s8, v14 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_readfirstlane_b32 s9, v14 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v37 ; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: s_lshr_b64 s[90:91], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 16 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; SI-NEXT: s_lshr_b64 s[34:35], s[6:7], 8 +; SI-NEXT: v_bfe_u32 v20, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v38, v2, 8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 -; SI-NEXT: s_lshr_b32 s53, s9, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s10, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s10, v14 ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_readfirstlane_b32 s11, v14 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readfirstlane_b32 s11, v55 +; SI-NEXT: v_readfirstlane_b32 s11, v4 ; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s12, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v35 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 -; SI-NEXT: s_lshr_b32 s55, s11, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s12, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s12, v14 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_readfirstlane_b32 s13, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_readfirstlane_b32 s13, v5 ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[10:11], 8 ; SI-NEXT: v_readfirstlane_b32 s14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 8 -; SI-NEXT: s_lshr_b32 s65, s13, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s14, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_readfirstlane_b32 s15, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_readfirstlane_b32 s15, v28 ; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 8 ; SI-NEXT: v_readfirstlane_b32 s16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 ; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_lshr_b64 s[30:31], s[14:15], 8 -; SI-NEXT: s_lshr_b32 s67, s15, 8 -; SI-NEXT: v_readfirstlane_b32 s16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v21 +; SI-NEXT: v_readfirstlane_b32 s16, v15 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[96:97], s[14:15], 24 +; SI-NEXT: s_lshr_b32 s77, s15, 8 ; SI-NEXT: v_readfirstlane_b32 s17, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readfirstlane_b32 s17, v44 +; SI-NEXT: v_readfirstlane_b32 s17, v26 ; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshr_b32 s73, s13, 8 +; SI-NEXT: s_lshr_b32 s61, s11, 8 +; SI-NEXT: s_lshr_b32 s59, s9, 8 +; SI-NEXT: s_lshr_b32 s57, s7, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 -; SI-NEXT: s_lshr_b32 s48, s17, 8 -; SI-NEXT: v_readfirstlane_b32 s18, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v60 +; SI-NEXT: s_lshr_b64 s[82:83], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[98:99], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_readfirstlane_b32 s18, v15 ; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s19, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readfirstlane_b32 s19, v54 +; SI-NEXT: v_readfirstlane_b32 s19, v2 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s20, v14 -; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 -; SI-NEXT: s_lshr_b32 s50, s19, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[84:85], s[18:19], 8 +; SI-NEXT: s_lshr_b32 s75, s19, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_readfirstlane_b32 s20, v15 ; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s21, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readfirstlane_b32 s21, v31 +; SI-NEXT: v_readfirstlane_b32 s21, v3 ; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s22, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v45 ; SI-NEXT: s_or_b32 s21, s22, s21 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 -; SI-NEXT: s_lshr_b32 s52, s21, 8 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s22, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 -; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[20:21], 16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 8 +; SI-NEXT: s_lshr_b32 s79, s21, 8 ; SI-NEXT: v_readfirstlane_b32 s23, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: s_or_b32 s22, s23, s22 -; SI-NEXT: v_readfirstlane_b32 s23, v32 -; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s24, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 -; SI-NEXT: s_or_b32 s23, s24, s23 -; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v42 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s24, v14 -; SI-NEXT: s_lshl_b32 s24, s24, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s25, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 -; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 -; SI-NEXT: v_readfirstlane_b32 s25, v30 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: v_bfe_u32 v35, v30, 8, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 -; SI-NEXT: s_or_b32 s25, s26, s25 -; SI-NEXT: s_lshr_b32 s64, s25, 8 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s26, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 -; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s27, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_readfirstlane_b32 s22, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v44 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: v_readfirstlane_b32 s23, v58 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_or_b32 s23, s24, s23 +; SI-NEXT: s_lshr_b64 s[52:53], s[22:23], 16 +; SI-NEXT: v_readfirstlane_b32 s24, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: s_lshr_b64 s[64:65], s[22:23], 8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 +; SI-NEXT: s_lshr_b32 s89, s23, 8 +; SI-NEXT: v_readfirstlane_b32 s25, v60 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_or_b32 s25, s26, s25 +; SI-NEXT: s_lshr_b64 s[50:51], s[24:25], 8 +; SI-NEXT: v_readfirstlane_b32 s26, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 +; SI-NEXT: s_lshl_b32 s26, s26, 16 ; SI-NEXT: s_or_b32 s26, s27, s26 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 -; SI-NEXT: v_bfe_u32 v18, v44, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s27, v46 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshr_b32 s91, s25, 8 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v15 +; SI-NEXT: v_bfe_u32 v21, v60, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s27, v61 ; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: v_bfe_u32 v37, v46, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s28, v14 -; SI-NEXT: s_or_b32 s27, s28, s27 -; SI-NEXT: s_lshr_b32 s66, s27, 8 +; SI-NEXT: v_bfe_u32 v48, v61, 8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s28, v14 -; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_or_b32 s27, s28, s27 +; SI-NEXT: s_lshr_b32 s93, s27, 8 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_readfirstlane_b32 s28, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v33 +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_bfe_u32 v22, v33, 8, 8 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s29, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v56 ; SI-NEXT: s_or_b32 s28, s29, s28 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_bfe_u32 v15, v5, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s29, v56 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s29, v33 ; SI-NEXT: s_lshl_b32 s29, s29, 16 -; SI-NEXT: v_bfe_u32 v38, v56, 8, 8 -; SI-NEXT: v_readfirstlane_b32 s40, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_or_b32 s29, s40, s29 -; SI-NEXT: s_lshr_b32 s68, s29, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s40, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_lshl_b32 s40, s40, 16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: s_or_b32 s29, s40, s29 +; SI-NEXT: s_lshr_b32 s95, s29, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s41, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_readfirstlane_b32 s40, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s40, s40, 16 ; SI-NEXT: s_or_b32 s40, s41, s40 ; SI-NEXT: v_readfirstlane_b32 s41, v57 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_lshl_b32 s41, s41, 16 -; SI-NEXT: v_readfirstlane_b32 s42, v14 -; SI-NEXT: s_or_b32 s41, s42, s41 -; SI-NEXT: s_lshr_b32 s69, s41, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s42, v14 -; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_or_b32 s41, s42, s41 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_lshr_b32 s31, s41, 8 ; SI-NEXT: v_readfirstlane_b32 s43, v14 +; SI-NEXT: v_readfirstlane_b32 s42, v15 +; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_or_b32 s42, s43, s42 ; SI-NEXT: v_readfirstlane_b32 s43, v12 ; SI-NEXT: s_lshl_b32 s43, s43, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_readfirstlane_b32 s44, v14 ; SI-NEXT: s_or_b32 s43, s44, s43 -; SI-NEXT: s_lshr_b32 s70, s43, 8 +; SI-NEXT: s_lshr_b32 s35, s43, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_readfirstlane_b32 s44, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s44, s44, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_mov_b32_e32 v1, v3 -; SI-NEXT: v_bfe_u32 v47, v1, 8, 8 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_bfe_u32 v1, v26, 8, 8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_bfe_u32 v16, v56, 8, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_readfirstlane_b32 s45, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s44, s45, s44 ; SI-NEXT: v_readfirstlane_b32 s45, v13 -; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshl_b32 s45, s45, 16 ; SI-NEXT: v_readfirstlane_b32 s46, v14 ; SI-NEXT: s_or_b32 s45, s46, s45 ; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 4 -; SI-NEXT: v_writelane_b32 v62, s47, 5 -; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 16 ; SI-NEXT: v_writelane_b32 v62, s46, 2 ; SI-NEXT: v_writelane_b32 v62, s47, 3 ; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 8 ; SI-NEXT: v_writelane_b32 v62, s46, 0 ; SI-NEXT: v_writelane_b32 v62, s47, 1 ; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 10 -; SI-NEXT: v_writelane_b32 v62, s47, 11 -; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 ; SI-NEXT: v_writelane_b32 v62, s46, 8 ; SI-NEXT: v_writelane_b32 v62, s47, 9 -; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 ; SI-NEXT: v_writelane_b32 v62, s46, 6 ; SI-NEXT: v_writelane_b32 v62, s47, 7 +; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 8 +; SI-NEXT: v_writelane_b32 v62, s46, 4 +; SI-NEXT: v_writelane_b32 v62, s47, 5 ; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 16 -; SI-NEXT: v_writelane_b32 v62, s47, 17 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 ; SI-NEXT: v_writelane_b32 v62, s46, 14 ; SI-NEXT: v_writelane_b32 v62, s47, 15 -; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 ; SI-NEXT: v_writelane_b32 v62, s46, 12 ; SI-NEXT: v_writelane_b32 v62, s47, 13 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v62, s46, 10 +; SI-NEXT: v_writelane_b32 v62, s47, 11 ; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 22 -; SI-NEXT: v_writelane_b32 v62, s47, 23 -; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 16 ; SI-NEXT: v_writelane_b32 v62, s46, 20 ; SI-NEXT: v_writelane_b32 v62, s47, 21 -; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 16 ; SI-NEXT: v_writelane_b32 v62, s46, 18 ; SI-NEXT: v_writelane_b32 v62, s47, 19 +; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 8 +; SI-NEXT: v_writelane_b32 v62, s46, 16 +; SI-NEXT: v_writelane_b32 v62, s47, 17 ; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 28 -; SI-NEXT: v_writelane_b32 v62, s47, 29 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 ; SI-NEXT: v_writelane_b32 v62, s46, 26 ; SI-NEXT: v_writelane_b32 v62, s47, 27 -; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 ; SI-NEXT: v_writelane_b32 v62, s46, 24 ; SI-NEXT: v_writelane_b32 v62, s47, 25 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 +; SI-NEXT: v_writelane_b32 v62, s46, 22 +; SI-NEXT: v_writelane_b32 v62, s47, 23 ; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 34 -; SI-NEXT: v_writelane_b32 v62, s47, 35 -; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 32 -; SI-NEXT: v_writelane_b32 v62, s47, 33 -; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 8 ; SI-NEXT: v_writelane_b32 v62, s46, 30 ; SI-NEXT: v_writelane_b32 v62, s47, 31 +; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 16 +; SI-NEXT: v_writelane_b32 v62, s46, 28 +; SI-NEXT: v_writelane_b32 v62, s47, 29 ; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 40 -; SI-NEXT: v_writelane_b32 v62, s47, 41 -; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 38 -; SI-NEXT: v_writelane_b32 v62, s47, 39 -; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 8 -; SI-NEXT: v_writelane_b32 v62, s46, 36 -; SI-NEXT: v_writelane_b32 v62, s47, 37 -; SI-NEXT: s_lshr_b64 s[46:47], s[20:21], 24 -; SI-NEXT: v_writelane_b32 v62, s46, 44 -; SI-NEXT: v_writelane_b32 v62, s47, 45 -; SI-NEXT: s_lshr_b64 s[46:47], s[20:21], 16 -; SI-NEXT: v_writelane_b32 v62, s46, 42 -; SI-NEXT: v_writelane_b32 v62, s47, 43 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 -; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 -; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 -; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 -; SI-NEXT: s_lshr_b32 s71, s45, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 32 +; SI-NEXT: v_writelane_b32 v62, s47, 33 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[44:45], 16 +; SI-NEXT: s_lshr_b32 s37, s45, 8 +; SI-NEXT: s_lshr_b32 s47, s5, 8 ; SI-NEXT: .LBB95_5: ; %end ; SI-NEXT: v_readlane_b32 vcc_lo, v62, 0 -; SI-NEXT: s_lshl_b32 s47, vcc_lo, 8 ; SI-NEXT: s_and_b32 s44, s44, 0xff -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 2 -; SI-NEXT: s_or_b32 s44, s44, s47 -; SI-NEXT: s_and_b32 s47, vcc_lo, 0xff -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4 -; SI-NEXT: s_lshl_b32 s57, vcc_lo, 24 -; SI-NEXT: s_lshl_b32 s47, s47, 16 -; SI-NEXT: s_or_b32 s47, s57, s47 +; SI-NEXT: s_lshl_b32 vcc_lo, vcc_lo, 8 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 +; SI-NEXT: s_or_b32 s44, s44, vcc_lo +; SI-NEXT: s_and_b32 vcc_lo, s48, 0xff +; SI-NEXT: v_readlane_b32 s48, v62, 2 +; SI-NEXT: s_lshl_b32 vcc_lo, vcc_lo, 16 +; SI-NEXT: s_lshl_b32 vcc_hi, s48, 24 +; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo ; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: s_or_b32 s44, s44, s47 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v16, s44 +; SI-NEXT: s_or_b32 s44, s44, vcc_lo +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v14, s44 ; SI-NEXT: s_and_b32 s44, s45, 0xff -; SI-NEXT: s_lshl_b32 s45, s71, 8 +; SI-NEXT: s_lshl_b32 s45, s37, 8 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v23 ; SI-NEXT: s_and_b32 s44, s44, 0xffff -; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: v_or_b32_e32 v13, s44, v13 -; SI-NEXT: v_readlane_b32 s44, v62, 6 -; SI-NEXT: s_lshl_b32 s44, s44, 8 +; SI-NEXT: v_readlane_b32 s44, v62, 4 ; SI-NEXT: s_and_b32 s42, s42, 0xff -; SI-NEXT: v_readlane_b32 s45, v62, 7 +; SI-NEXT: s_lshl_b32 s44, s44, 8 +; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 4, v0 +; SI-NEXT: v_readlane_b32 s45, v62, 5 ; SI-NEXT: s_or_b32 s42, s42, s44 -; SI-NEXT: v_readlane_b32 s44, v62, 8 -; SI-NEXT: v_readlane_b32 s45, v62, 9 +; SI-NEXT: v_readlane_b32 s44, v62, 6 +; SI-NEXT: v_readlane_b32 s45, v62, 7 ; SI-NEXT: s_and_b32 s44, s44, 0xff -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 10 -; SI-NEXT: s_lshl_b32 s45, vcc_lo, 24 +; SI-NEXT: v_readlane_b32 vcc_lo, v62, 8 ; SI-NEXT: s_lshl_b32 s44, s44, 16 -; SI-NEXT: s_or_b32 s44, s45, s44 +; SI-NEXT: s_lshl_b32 s45, vcc_lo, 24 ; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s44, s45, s44 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 9 ; SI-NEXT: s_or_b32 s42, s42, s44 -; SI-NEXT: v_mov_b32_e32 v21, s42 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 +; SI-NEXT: v_mov_b32_e32 v14, s42 ; SI-NEXT: s_and_b32 s42, s43, 0xff -; SI-NEXT: s_lshl_b32 s43, s70, 8 +; SI-NEXT: s_lshl_b32 s43, s35, 8 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s42, s42, s43 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v50 ; SI-NEXT: s_and_b32 s42, s42, 0xffff -; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_or_b32_e32 v12, s42, v12 -; SI-NEXT: v_readlane_b32 s42, v62, 12 -; SI-NEXT: s_lshl_b32 s42, s42, 8 +; SI-NEXT: v_readlane_b32 s42, v62, 10 ; SI-NEXT: s_and_b32 s40, s40, 0xff -; SI-NEXT: v_readlane_b32 s43, v62, 13 +; SI-NEXT: s_lshl_b32 s42, s42, 8 +; SI-NEXT: v_readlane_b32 s43, v62, 11 ; SI-NEXT: s_or_b32 s40, s40, s42 -; SI-NEXT: v_readlane_b32 s42, v62, 14 -; SI-NEXT: v_readlane_b32 s43, v62, 15 +; SI-NEXT: v_readlane_b32 s42, v62, 12 +; SI-NEXT: v_readlane_b32 s43, v62, 13 ; SI-NEXT: s_and_b32 s42, s42, 0xff -; SI-NEXT: v_readlane_b32 s44, v62, 16 -; SI-NEXT: s_lshl_b32 s43, s44, 24 +; SI-NEXT: v_readlane_b32 s44, v62, 14 ; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_or_b32 s42, s43, s42 +; SI-NEXT: s_lshl_b32 s43, s44, 24 ; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s42, s43, s42 +; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 ; SI-NEXT: s_or_b32 s40, s40, s42 -; SI-NEXT: v_mov_b32_e32 v23, s40 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 +; SI-NEXT: v_mov_b32_e32 v13, s40 ; SI-NEXT: s_and_b32 s40, s41, 0xff -; SI-NEXT: s_lshl_b32 s41, s69, 8 +; SI-NEXT: s_lshl_b32 s41, s31, 8 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v57 +; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s40, s40, s41 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v50 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v49 ; SI-NEXT: s_and_b32 s40, s40, 0xffff -; SI-NEXT: v_or_b32_e32 v11, v24, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_or_b32_e32 v11, s40, v11 -; SI-NEXT: v_readlane_b32 s40, v62, 18 -; SI-NEXT: s_lshl_b32 s40, s40, 8 +; SI-NEXT: v_readlane_b32 s40, v62, 16 ; SI-NEXT: s_and_b32 s28, s28, 0xff -; SI-NEXT: v_readlane_b32 s41, v62, 19 +; SI-NEXT: s_lshl_b32 s40, s40, 8 +; SI-NEXT: v_readlane_b32 s41, v62, 17 ; SI-NEXT: s_or_b32 s28, s28, s40 -; SI-NEXT: v_readlane_b32 s40, v62, 20 -; SI-NEXT: v_readlane_b32 s41, v62, 21 +; SI-NEXT: v_readlane_b32 s40, v62, 18 +; SI-NEXT: v_readlane_b32 s41, v62, 19 ; SI-NEXT: s_and_b32 s40, s40, 0xff -; SI-NEXT: v_readlane_b32 s42, v62, 22 -; SI-NEXT: s_lshl_b32 s41, s42, 24 +; SI-NEXT: v_readlane_b32 s42, v62, 20 ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_or_b32 s40, s41, s40 +; SI-NEXT: s_lshl_b32 s41, s42, 24 ; SI-NEXT: s_and_b32 s28, s28, 0xffff +; SI-NEXT: s_or_b32 s40, s41, s40 +; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 ; SI-NEXT: s_or_b32 s28, s28, s40 -; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: v_mov_b32_e32 v12, s28 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s28, s29, 0xff -; SI-NEXT: s_lshl_b32 s29, s68, 8 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v56 +; SI-NEXT: s_lshl_b32 s29, s95, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v33 ; SI-NEXT: s_or_b32 s28, s28, s29 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v22 ; SI-NEXT: s_and_b32 s28, s28, 0xffff -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_or_b32_e32 v25, s28, v25 -; SI-NEXT: v_readlane_b32 s28, v62, 24 -; SI-NEXT: s_lshl_b32 s28, s28, 8 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v11, s28, v11 +; SI-NEXT: v_readlane_b32 s28, v62, 22 ; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: v_readlane_b32 s29, v62, 25 +; SI-NEXT: s_lshl_b32 s28, s28, 8 +; SI-NEXT: v_readlane_b32 s29, v62, 23 ; SI-NEXT: s_or_b32 s26, s26, s28 -; SI-NEXT: v_readlane_b32 s28, v62, 26 -; SI-NEXT: v_readlane_b32 s29, v62, 27 +; SI-NEXT: v_readlane_b32 s28, v62, 24 +; SI-NEXT: v_readlane_b32 s29, v62, 25 ; SI-NEXT: s_and_b32 s28, s28, 0xff -; SI-NEXT: v_readlane_b32 s40, v62, 28 -; SI-NEXT: s_lshl_b32 s29, s40, 24 +; SI-NEXT: v_readlane_b32 s40, v62, 26 ; SI-NEXT: s_lshl_b32 s28, s28, 16 -; SI-NEXT: s_or_b32 s28, s29, s28 +; SI-NEXT: s_lshl_b32 s29, s40, 24 ; SI-NEXT: s_and_b32 s26, s26, 0xffff +; SI-NEXT: s_or_b32 s28, s29, s28 +; SI-NEXT: v_add_i32_e32 v12, vcc, 28, v0 ; SI-NEXT: s_or_b32 s26, s26, s28 -; SI-NEXT: v_mov_b32_e32 v26, s26 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: v_mov_b32_e32 v12, s26 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s26, s27, 0xff -; SI-NEXT: s_lshl_b32 s27, s66, 8 -; SI-NEXT: v_and_b32_e32 v27, 0xff, v46 +; SI-NEXT: s_lshl_b32 s27, s93, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v61 ; SI-NEXT: s_or_b32 s26, s26, s27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v48 ; SI-NEXT: s_and_b32 s26, s26, 0xffff -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_or_b32_e32 v27, s26, v27 -; SI-NEXT: v_readlane_b32 s26, v62, 30 -; SI-NEXT: s_lshl_b32 s26, s26, 8 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v11, s26, v11 ; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 -; SI-NEXT: v_readlane_b32 s27, v62, 31 +; SI-NEXT: s_lshl_b32 s26, s50, 8 ; SI-NEXT: s_or_b32 s24, s24, s26 -; SI-NEXT: v_readlane_b32 s26, v62, 32 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3 -; SI-NEXT: v_readlane_b32 s27, v62, 33 +; SI-NEXT: v_readlane_b32 s26, v62, 28 +; SI-NEXT: v_readlane_b32 s27, v62, 29 ; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: v_readlane_b32 s28, v62, 34 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 -; SI-NEXT: s_lshl_b32 s27, s28, 24 +; SI-NEXT: v_readlane_b32 s28, v62, 30 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11 -; SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: s_lshl_b32 s27, s28, 24 ; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 ; SI-NEXT: s_or_b32 s24, s24, s26 -; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v28, s24 +; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: v_mov_b32_e32 v12, s24 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s24, s25, 0xff -; SI-NEXT: s_lshl_b32 s25, s64, 8 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v30 -; SI-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 +; SI-NEXT: s_lshl_b32 s25, s91, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v35 -; SI-NEXT: buffer_store_dword v21, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 -; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v29, s24, v29 -; SI-NEXT: buffer_store_dword v23, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 -; SI-NEXT: v_readlane_b32 s24, v62, 36 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 -; SI-NEXT: s_lshl_b32 s24, s24, 8 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v21 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v11, s24, v11 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: buffer_store_dword v24, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 -; SI-NEXT: v_readlane_b32 s25, v62, 37 +; SI-NEXT: s_lshl_b32 s24, s64, 8 ; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: v_readlane_b32 s24, v62, 38 -; SI-NEXT: buffer_store_dword v25, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 -; SI-NEXT: v_readlane_b32 s25, v62, 39 -; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: v_readlane_b32 s26, v62, 40 -; SI-NEXT: buffer_store_dword v26, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: s_and_b32 s24, s52, 0xff +; SI-NEXT: v_readlane_b32 s26, v62, 32 ; SI-NEXT: s_lshl_b32 s24, s24, 16 ; SI-NEXT: s_lshl_b32 s25, s26, 24 -; SI-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 ; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: buffer_store_dword v29, v11, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 ; SI-NEXT: v_mov_b32_e32 v12, s22 ; SI-NEXT: s_and_b32 s22, s23, 0xff -; SI-NEXT: s_lshl_b32 s23, s54, 8 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v32 +; SI-NEXT: s_lshl_b32 s23, s89, 8 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v58 ; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v43 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_or_b32_e32 v10, s22, v10 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s62, 8 +; SI-NEXT: s_lshl_b32 s22, s70, 8 ; SI-NEXT: s_or_b32 s20, s20, s22 -; SI-NEXT: v_readlane_b32 s22, v62, 42 -; SI-NEXT: v_readlane_b32 s23, v62, 43 -; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: v_readlane_b32 s24, v62, 44 +; SI-NEXT: s_and_b32 s22, s66, 0xff ; SI-NEXT: s_lshl_b32 s22, s22, 16 -; SI-NEXT: s_lshl_b32 s23, s24, 24 +; SI-NEXT: s_lshl_b32 s23, s54, 24 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 @@ -192948,8 +192317,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 ; SI-NEXT: v_mov_b32_e32 v11, s20 ; SI-NEXT: s_and_b32 s20, s21, 0xff -; SI-NEXT: s_lshl_b32 s21, s52, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v31 +; SI-NEXT: s_lshl_b32 s21, s79, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v3 ; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -192958,11 +192327,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v9, s20, v9 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s20, s58, 8 +; SI-NEXT: s_lshl_b32 s20, s84, 8 ; SI-NEXT: s_or_b32 s18, s18, s20 -; SI-NEXT: s_and_b32 s20, s98, 0xff +; SI-NEXT: s_and_b32 s20, s80, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_lshl_b32 s21, s96, 24 +; SI-NEXT: s_lshl_b32 s21, s68, 24 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s20, s21, s20 ; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 @@ -192972,21 +192341,22 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 ; SI-NEXT: v_mov_b32_e32 v10, s18 ; SI-NEXT: s_and_b32 s18, s19, 0xff -; SI-NEXT: s_lshl_b32 s19, s50, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v54 +; SI-NEXT: s_lshl_b32 s19, s75, 8 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v2 ; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v38 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_or_b32_e32 v8, s18, v8 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s18, s38, 8 +; SI-NEXT: s_lshl_b32 s18, s98, 8 ; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: s_and_b32 s18, s36, 0xff +; SI-NEXT: s_and_b32 s18, s86, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s34, 24 +; SI-NEXT: s_lshl_b32 s19, s82, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 @@ -192996,21 +192366,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 ; SI-NEXT: v_mov_b32_e32 v9, s16 ; SI-NEXT: s_and_b32 s16, s17, 0xff -; SI-NEXT: s_lshl_b32 s17, s48, 8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v44 +; SI-NEXT: s_lshl_b32 s17, s63, 8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 ; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v1 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_or_b32_e32 v7, s16, v7 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s16, s30, 8 +; SI-NEXT: s_lshl_b32 s16, s58, 8 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: s_and_b32 s16, s94, 0xff +; SI-NEXT: s_and_b32 s16, s46, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s92, 24 +; SI-NEXT: s_lshl_b32 s17, s96, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 @@ -193020,21 +192390,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 ; SI-NEXT: v_mov_b32_e32 v8, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: s_lshl_b32 s15, s67, 8 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: s_lshl_b32 s15, s77, 8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v28 ; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v47 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_or_b32_e32 v6, s14, v6 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s14, s90, 8 +; SI-NEXT: s_lshl_b32 s14, s72, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: s_and_b32 s14, s88, 0xff +; SI-NEXT: s_and_b32 s14, s60, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s15, s78, 24 +; SI-NEXT: s_lshl_b32 s15, s56, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 @@ -193044,21 +192414,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 ; SI-NEXT: v_mov_b32_e32 v7, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: s_lshl_b32 s13, s65, 8 +; SI-NEXT: s_lshl_b32 s13, s73, 8 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v43 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_or_b32_e32 v5, s12, v5 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s12, s76, 8 +; SI-NEXT: s_lshl_b32 s12, s78, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: s_and_b32 s12, s74, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s13, s72, 24 +; SI-NEXT: s_lshl_b32 s13, s62, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 @@ -193068,21 +192438,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: s_lshl_b32 s11, s55, 8 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v55 +; SI-NEXT: s_lshl_b32 s11, s61, 8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v4, s10, v4 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s60, 8 +; SI-NEXT: s_lshl_b32 s10, s92, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: s_and_b32 s10, s56, 0xff +; SI-NEXT: s_and_b32 s10, s88, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s46, 24 +; SI-NEXT: s_lshl_b32 s11, s76, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 @@ -193092,21 +192462,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 ; SI-NEXT: v_mov_b32_e32 v5, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: s_lshl_b32 s9, s53, 8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_lshl_b32 s9, s59, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v37 ; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v18 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_or_b32_e32 v2, s8, v2 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s86, 8 +; SI-NEXT: s_lshl_b32 s8, s34, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s84, 0xff +; SI-NEXT: s_and_b32 s8, s94, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s82, 24 +; SI-NEXT: s_lshl_b32 s9, s90, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 @@ -193116,24 +192486,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s51, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_lshl_b32 s7, s57, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v55 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s6, v1 -; SI-NEXT: v_readlane_b32 s6, v62, 46 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: v_readlane_b32 s7, v62, 47 +; SI-NEXT: s_lshl_b32 s6, s38, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s80, 0xff -; SI-NEXT: v_readlane_b32 s8, v62, 48 +; SI-NEXT: s_and_b32 s6, s36, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s8, 24 +; SI-NEXT: s_lshl_b32 s7, s30, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 @@ -193144,12 +192511,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: s_lshl_b32 s5, s49, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 @@ -193169,13 +192536,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s45, v62, 17 -; SI-NEXT: v_readlane_b32 s43, v62, 23 -; SI-NEXT: v_readlane_b32 s41, v62, 29 -; SI-NEXT: v_readlane_b32 s29, v62, 35 -; SI-NEXT: v_readlane_b32 s27, v62, 41 -; SI-NEXT: v_readlane_b32 s25, v62, 45 -; SI-NEXT: v_readlane_b32 s9, v62, 49 +; SI-NEXT: v_readlane_b32 s49, v62, 3 +; SI-NEXT: v_readlane_b32 s45, v62, 15 +; SI-NEXT: v_readlane_b32 s43, v62, 21 +; SI-NEXT: v_readlane_b32 s41, v62, 27 +; SI-NEXT: v_readlane_b32 s29, v62, 31 +; SI-NEXT: v_readlane_b32 s27, v62, 33 ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -193213,8 +192579,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -193223,8 +192589,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -193287,7 +192653,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; VI-NEXT: v_writelane_b32 v63, s87, 31 -; VI-NEXT: v_mov_b32_e32 v39, v0 ; VI-NEXT: v_readfirstlane_b32 s20, v17 ; VI-NEXT: v_readfirstlane_b32 s21, v18 ; VI-NEXT: v_readfirstlane_b32 s18, v3 @@ -193325,552 +192690,529 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: s_cbranch_scc0 .LBB95_3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s26, s5, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 26 +; VI-NEXT: v_writelane_b32 v62, s26, 39 ; VI-NEXT: s_lshr_b32 s26, s5, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 15 +; VI-NEXT: v_writelane_b32 v62, s26, 31 ; VI-NEXT: s_lshr_b32 s26, s5, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 57 ; VI-NEXT: s_lshr_b32 s26, s4, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 14 +; VI-NEXT: v_writelane_b32 v62, s26, 30 ; VI-NEXT: s_lshr_b32 s26, s4, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 56 ; VI-NEXT: s_lshr_b32 s26, s7, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 25 +; VI-NEXT: v_writelane_b32 v62, s26, 38 ; VI-NEXT: s_lshr_b32 s26, s7, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 13 +; VI-NEXT: v_writelane_b32 v62, s26, 29 ; VI-NEXT: s_lshr_b32 s26, s7, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 55 ; VI-NEXT: s_lshr_b32 s26, s6, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 12 +; VI-NEXT: v_writelane_b32 v62, s26, 28 ; VI-NEXT: s_lshr_b32 s26, s6, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 54 ; VI-NEXT: s_lshr_b32 s26, s9, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 24 +; VI-NEXT: v_writelane_b32 v62, s26, 37 ; VI-NEXT: s_lshr_b32 s26, s9, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 11 +; VI-NEXT: v_writelane_b32 v62, s26, 27 ; VI-NEXT: s_lshr_b32 s26, s9, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 53 ; VI-NEXT: s_lshr_b32 s26, s8, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 10 +; VI-NEXT: v_writelane_b32 v62, s26, 26 ; VI-NEXT: s_lshr_b32 s26, s8, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 52 ; VI-NEXT: s_lshr_b32 s26, s11, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 23 +; VI-NEXT: v_writelane_b32 v62, s26, 36 ; VI-NEXT: s_lshr_b32 s26, s11, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 9 +; VI-NEXT: v_writelane_b32 v62, s26, 25 ; VI-NEXT: s_lshr_b32 s26, s11, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 51 ; VI-NEXT: s_lshr_b32 s26, s10, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 24 ; VI-NEXT: s_lshr_b32 s26, s10, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 50 ; VI-NEXT: s_lshr_b32 s26, s13, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 22 +; VI-NEXT: v_writelane_b32 v62, s26, 35 ; VI-NEXT: s_lshr_b32 s26, s13, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 7 +; VI-NEXT: v_writelane_b32 v62, s26, 23 ; VI-NEXT: s_lshr_b32 s26, s13, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 49 ; VI-NEXT: s_lshr_b32 s26, s12, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 6 +; VI-NEXT: v_writelane_b32 v62, s26, 22 ; VI-NEXT: s_lshr_b32 s26, s12, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 48 ; VI-NEXT: s_lshr_b32 s26, s15, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 21 +; VI-NEXT: v_writelane_b32 v62, s26, 34 ; VI-NEXT: s_lshr_b32 s26, s15, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 5 +; VI-NEXT: v_writelane_b32 v62, s26, 21 ; VI-NEXT: s_lshr_b32 s26, s15, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 47 ; VI-NEXT: s_lshr_b32 s26, s14, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 4 +; VI-NEXT: v_writelane_b32 v62, s26, 20 ; VI-NEXT: s_lshr_b32 s26, s14, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 46 ; VI-NEXT: s_lshr_b32 s26, s17, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 20 +; VI-NEXT: v_writelane_b32 v62, s26, 33 ; VI-NEXT: s_lshr_b32 s26, s17, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 3 +; VI-NEXT: v_writelane_b32 v62, s26, 19 ; VI-NEXT: s_lshr_b32 s26, s17, 8 ; VI-NEXT: v_writelane_b32 v62, s26, 45 -; VI-NEXT: s_lshr_b32 s26, s16, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 2 -; VI-NEXT: s_lshr_b32 s26, s16, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 44 -; VI-NEXT: s_lshr_b32 s26, s19, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 19 -; VI-NEXT: s_lshr_b32 s26, s19, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 1 -; VI-NEXT: s_lshr_b32 s26, s19, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 43 -; VI-NEXT: s_lshr_b32 s26, s18, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 0 -; VI-NEXT: s_lshr_b32 s26, s18, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 42 -; VI-NEXT: s_lshr_b32 s26, s21, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 18 -; VI-NEXT: s_lshr_b32 s26, s21, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 41 -; VI-NEXT: s_lshr_b32 s26, s20, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 40 -; VI-NEXT: s_lshr_b32 s26, s23, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 17 -; VI-NEXT: s_lshr_b32 s26, s23, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 39 -; VI-NEXT: s_lshr_b32 s26, s22, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 38 -; VI-NEXT: s_lshr_b32 s26, s25, 24 -; VI-NEXT: v_writelane_b32 v62, s26, 16 -; VI-NEXT: s_lshr_b32 s26, s25, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 37 -; VI-NEXT: s_lshr_b32 s26, s24, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 36 -; VI-NEXT: s_lshr_b32 s26, s41, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 35 -; VI-NEXT: s_lshr_b32 s26, s40, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 34 -; VI-NEXT: s_lshr_b32 s26, s43, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 33 -; VI-NEXT: s_lshr_b32 s26, s42, 8 +; VI-NEXT: s_lshr_b32 s26, s16, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 18 +; VI-NEXT: s_lshr_b32 s26, s16, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 44 +; VI-NEXT: s_lshr_b32 s26, s19, 24 ; VI-NEXT: v_writelane_b32 v62, s26, 32 -; VI-NEXT: s_lshr_b32 s26, s45, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 31 -; VI-NEXT: s_lshr_b32 s26, s44, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 30 -; VI-NEXT: s_lshr_b32 s26, s47, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 29 -; VI-NEXT: s_lshr_b32 s26, s46, 8 -; VI-NEXT: v_writelane_b32 v62, s26, 28 -; VI-NEXT: s_lshr_b32 s26, s57, 8 -; VI-NEXT: s_lshr_b32 s86, s21, 16 -; VI-NEXT: s_lshr_b32 s87, s20, 16 -; VI-NEXT: s_lshr_b32 s50, s23, 16 -; VI-NEXT: s_lshr_b32 s51, s22, 16 -; VI-NEXT: s_lshr_b32 s52, s25, 16 -; VI-NEXT: s_lshr_b32 s53, s24, 16 +; VI-NEXT: s_lshr_b32 s26, s19, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 17 +; VI-NEXT: s_lshr_b32 s26, s19, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 43 +; VI-NEXT: s_lshr_b32 s26, s18, 16 +; VI-NEXT: v_writelane_b32 v62, s26, 16 +; VI-NEXT: s_lshr_b32 s26, s18, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 42 +; VI-NEXT: s_lshr_b32 s26, s21, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 41 +; VI-NEXT: s_lshr_b32 s26, s20, 8 +; VI-NEXT: v_writelane_b32 v62, s26, 40 +; VI-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 14 +; VI-NEXT: v_writelane_b32 v62, s79, 15 +; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 12 +; VI-NEXT: v_writelane_b32 v62, s79, 13 +; VI-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 10 +; VI-NEXT: v_writelane_b32 v62, s79, 11 +; VI-NEXT: s_lshr_b64 s[78:79], s[10:11], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 8 +; VI-NEXT: v_writelane_b32 v62, s79, 9 +; VI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 6 +; VI-NEXT: v_writelane_b32 v62, s79, 7 +; VI-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 4 +; VI-NEXT: v_writelane_b32 v62, s79, 5 +; VI-NEXT: s_lshr_b64 s[78:79], s[16:17], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 2 +; VI-NEXT: v_writelane_b32 v62, s79, 3 +; VI-NEXT: s_lshr_b64 s[78:79], s[18:19], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 0 +; VI-NEXT: s_lshr_b32 s66, s21, 24 +; VI-NEXT: s_lshr_b32 s28, s21, 16 +; VI-NEXT: s_lshr_b32 s29, s20, 16 +; VI-NEXT: s_lshr_b32 s69, s23, 24 +; VI-NEXT: s_lshr_b32 s58, s23, 16 +; VI-NEXT: s_lshr_b32 s86, s23, 8 +; VI-NEXT: s_lshr_b32 s59, s22, 16 +; VI-NEXT: s_lshr_b32 s87, s22, 8 +; VI-NEXT: s_lshr_b32 s71, s25, 24 +; VI-NEXT: s_lshr_b32 s60, s25, 16 +; VI-NEXT: s_lshr_b32 s50, s25, 8 +; VI-NEXT: s_lshr_b32 s61, s24, 16 +; VI-NEXT: s_lshr_b32 s51, s24, 8 ; VI-NEXT: s_lshr_b32 s81, s41, 24 -; VI-NEXT: s_lshr_b32 s54, s41, 16 -; VI-NEXT: s_lshr_b32 s55, s40, 16 +; VI-NEXT: s_lshr_b32 s62, s41, 16 +; VI-NEXT: s_lshr_b32 s52, s41, 8 +; VI-NEXT: s_lshr_b32 s63, s40, 16 +; VI-NEXT: s_lshr_b32 s53, s40, 8 ; VI-NEXT: s_lshr_b32 s82, s43, 24 -; VI-NEXT: s_lshr_b32 s64, s43, 16 -; VI-NEXT: s_lshr_b32 s65, s42, 16 +; VI-NEXT: s_lshr_b32 s72, s43, 16 +; VI-NEXT: s_lshr_b32 s54, s43, 8 +; VI-NEXT: s_lshr_b32 s73, s42, 16 +; VI-NEXT: s_lshr_b32 s55, s42, 8 ; VI-NEXT: s_lshr_b32 s83, s45, 24 -; VI-NEXT: s_lshr_b32 s66, s45, 16 -; VI-NEXT: s_lshr_b32 s67, s44, 16 -; VI-NEXT: s_lshr_b32 s84, s47, 24 -; VI-NEXT: s_lshr_b32 s68, s47, 16 -; VI-NEXT: s_lshr_b32 s69, s46, 16 -; VI-NEXT: s_lshr_b32 s85, s57, 24 -; VI-NEXT: s_lshr_b32 s70, s57, 16 -; VI-NEXT: v_writelane_b32 v62, s26, 27 -; VI-NEXT: s_lshr_b32 s71, s56, 16 +; VI-NEXT: s_lshr_b32 s74, s45, 16 +; VI-NEXT: s_lshr_b32 s64, s45, 8 +; VI-NEXT: s_lshr_b32 s75, s44, 16 +; VI-NEXT: s_lshr_b32 s65, s44, 8 +; VI-NEXT: s_lshr_b32 s26, s47, 24 +; VI-NEXT: s_lshr_b32 s76, s47, 16 +; VI-NEXT: s_lshr_b32 s67, s47, 8 +; VI-NEXT: s_lshr_b32 s77, s46, 16 +; VI-NEXT: s_lshr_b32 s68, s46, 8 +; VI-NEXT: s_lshr_b32 s27, s57, 24 +; VI-NEXT: s_lshr_b32 s84, s57, 16 +; VI-NEXT: s_lshr_b32 s70, s57, 8 +; VI-NEXT: s_lshr_b32 s85, s56, 16 ; VI-NEXT: s_lshr_b32 s80, s56, 8 -; VI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 -; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[46:47], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[56:57], 24 +; VI-NEXT: v_writelane_b32 v62, s79, 1 +; VI-NEXT: s_lshr_b64 s[48:49], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[42:43], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[46:47], 24 +; VI-NEXT: s_lshr_b64 s[78:79], s[56:57], 24 ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshr_b32 s26, s57, 16 -; VI-NEXT: v_mov_b32_e32 v9, 0x200 -; VI-NEXT: v_add_f16_e32 v0, s26, v9 +; VI-NEXT: v_mov_b32_e32 v8, 0x200 +; VI-NEXT: v_add_f16_e32 v32, s26, v8 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; VI-NEXT: v_add_f16_e32 v2, s57, v8 ; VI-NEXT: s_lshr_b32 s26, s56, 16 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_add_f16_e32 v2, s57, v9 -; VI-NEXT: v_add_f16_e32 v0, s26, v9 +; VI-NEXT: v_or_b32_e32 v21, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s26, v8 ; VI-NEXT: s_lshr_b32 s26, s47, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v35, v2, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_add_f16_e32 v2, s56, v9 -; VI-NEXT: v_add_f16_e32 v0, s26, v9 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s56, v8 +; VI-NEXT: v_add_f16_e32 v60, s26, v8 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v20, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; VI-NEXT: v_add_f16_e32 v2, s47, v8 ; VI-NEXT: s_lshr_b32 s26, s46, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v34, v2, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_add_f16_e32 v2, s47, v9 -; VI-NEXT: v_add_f16_e32 v0, s26, v9 -; VI-NEXT: s_lshr_b32 s26, s45, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v23, v2, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_add_f16_e32 v2, s46, v9 -; VI-NEXT: v_add_f16_e32 v0, s26, v9 +; VI-NEXT: v_add_f16_e32 v1, s26, v8 +; VI-NEXT: s_lshr_b32 s26, s45, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v2, s46, v8 +; VI-NEXT: v_add_f16_e32 v33, s26, v8 ; VI-NEXT: s_lshr_b32 s26, s44, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v22, v2, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_add_f16_e32 v2, s45, v9 -; VI-NEXT: v_add_f16_e32 v0, s26, v9 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; VI-NEXT: v_add_f16_e32 v2, s45, v8 +; VI-NEXT: v_add_f16_e32 v47, s26, v8 ; VI-NEXT: s_lshr_b32 s26, s43, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v21, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_add_f16_e32 v2, s44, v9 -; VI-NEXT: v_add_f16_e32 v58, s26, v9 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v20, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 -; VI-NEXT: v_add_f16_e32 v2, s43, v9 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v11, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; VI-NEXT: v_add_f16_e32 v2, s44, v8 +; VI-NEXT: v_add_f16_e32 v56, s26, v8 ; VI-NEXT: s_lshr_b32 s26, s42, 16 -; VI-NEXT: v_or_b32_e32 v25, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s26, v9 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v10, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; VI-NEXT: v_add_f16_e32 v2, s43, v8 +; VI-NEXT: v_add_f16_e32 v34, s26, v8 ; VI-NEXT: s_lshr_b32 s26, s41, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v2, s42, v9 -; VI-NEXT: v_add_f16_e32 v46, s26, v9 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v25, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; VI-NEXT: v_add_f16_e32 v2, s42, v8 +; VI-NEXT: v_add_f16_e32 v57, s26, v8 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v24, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 -; VI-NEXT: v_add_f16_e32 v2, s41, v9 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; VI-NEXT: v_add_f16_e32 v2, s41, v8 ; VI-NEXT: s_lshr_b32 s26, s40, 16 ; VI-NEXT: v_or_b32_e32 v7, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s26, v9 +; VI-NEXT: v_add_f16_e32 v1, s26, v8 ; VI-NEXT: s_lshr_b32 s26, s25, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v2, s40, v9 -; VI-NEXT: v_add_f16_e32 v47, s26, v9 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v2, s40, v8 +; VI-NEXT: v_add_f16_e32 v58, s26, v8 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v6, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 -; VI-NEXT: v_add_f16_e32 v2, s25, v9 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; VI-NEXT: v_add_f16_e32 v2, s25, v8 ; VI-NEXT: s_lshr_b32 s25, s24, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v27, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s25, v9 -; VI-NEXT: v_add_f16_e32 v2, s24, v9 +; VI-NEXT: v_add_f16_e32 v1, s25, v8 +; VI-NEXT: v_add_f16_e32 v2, s24, v8 ; VI-NEXT: s_lshr_b32 s24, s23, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v44, s24, v9 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v59, s24, v8 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v26, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; VI-NEXT: v_add_f16_e32 v2, s23, v9 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; VI-NEXT: v_add_f16_e32 v2, s23, v8 ; VI-NEXT: s_lshr_b32 s23, s22, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v4, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s23, v9 -; VI-NEXT: v_add_f16_e32 v2, s22, v9 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v5, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s23, v8 +; VI-NEXT: v_add_f16_e32 v2, s22, v8 ; VI-NEXT: s_lshr_b32 s22, s21, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v60, s22, v9 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v3, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 -; VI-NEXT: v_add_f16_e32 v2, s21, v9 +; VI-NEXT: v_add_f16_e32 v41, s22, v8 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v4, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; VI-NEXT: v_add_f16_e32 v2, s21, v8 ; VI-NEXT: s_lshr_b32 s21, s20, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v2, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s21, v9 -; VI-NEXT: v_add_f16_e32 v10, s20, v9 -; VI-NEXT: s_lshr_b32 s20, s19, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v29, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s21, v8 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v43, s20, v9 -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v1, v10, v1 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 -; VI-NEXT: v_add_f16_e32 v51, s19, v9 +; VI-NEXT: v_add_f16_e32 v2, s20, v8 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v28, v2, v1 +; VI-NEXT: s_lshr_b32 s20, s19, 16 +; VI-NEXT: v_add_f16_e32 v2, s19, v8 ; VI-NEXT: s_lshr_b32 s19, s18, 16 -; VI-NEXT: v_or_b32_e32 v29, v51, v10 -; VI-NEXT: v_add_f16_e32 v10, s19, v9 -; VI-NEXT: v_add_f16_e32 v54, s18, v9 +; VI-NEXT: v_add_f16_e32 v50, s18, v8 ; VI-NEXT: s_lshr_b32 s18, s17, 16 -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_add_f16_e32 v45, s18, v9 -; VI-NEXT: v_add_f16_e32 v11, s17, v9 +; VI-NEXT: v_add_f16_e32 v51, s18, v8 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; VI-NEXT: v_add_f16_e32 v9, s17, v8 ; VI-NEXT: s_lshr_b32 s17, s16, 16 -; VI-NEXT: v_or_b32_e32 v28, v54, v10 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v45 -; VI-NEXT: v_add_f16_e32 v36, s17, v9 -; VI-NEXT: v_add_f16_e32 v55, s16, v9 +; VI-NEXT: v_or_b32_e32 v31, v9, v3 +; VI-NEXT: v_add_f16_e32 v3, s17, v8 +; VI-NEXT: v_add_f16_e32 v35, s16, v8 ; VI-NEXT: s_lshr_b32 s16, s15, 16 -; VI-NEXT: v_or_b32_e32 v19, v11, v10 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v36 -; VI-NEXT: v_add_f16_e32 v52, s16, v9 -; VI-NEXT: v_or_b32_e32 v18, v55, v10 -; VI-NEXT: s_lshr_b32 s17, s14, 16 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v52 -; VI-NEXT: v_add_f16_e32 v13, s15, v9 -; VI-NEXT: v_or_b32_e32 v31, v13, v10 -; VI-NEXT: v_add_f16_e32 v10, s17, v9 -; VI-NEXT: v_add_f16_e32 v59, s14, v9 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_f16_e32 v52, s16, v8 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v30, v35, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; VI-NEXT: v_add_f16_e32 v9, s15, v8 +; VI-NEXT: s_lshr_b32 s15, s14, 16 +; VI-NEXT: v_or_b32_e32 v19, v9, v3 +; VI-NEXT: v_add_f16_e32 v3, s15, v8 +; VI-NEXT: v_add_f16_e32 v36, s14, v8 ; VI-NEXT: s_lshr_b32 s14, s13, 16 -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; VI-NEXT: s_lshr_b32 s15, s12, 16 -; VI-NEXT: v_add_f16_e32 v50, s14, v9 -; VI-NEXT: v_or_b32_e32 v30, v59, v10 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v50 -; VI-NEXT: v_add_f16_e32 v53, s13, v9 -; VI-NEXT: v_add_f16_e32 v8, s15, v9 -; VI-NEXT: v_add_f16_e32 v41, s12, v9 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_f16_e32 v53, s14, v8 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v18, v36, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; VI-NEXT: v_add_f16_e32 v9, s13, v8 +; VI-NEXT: s_lshr_b32 s13, s12, 16 +; VI-NEXT: v_or_b32_e32 v46, v9, v3 +; VI-NEXT: v_add_f16_e32 v3, s13, v8 +; VI-NEXT: v_add_f16_e32 v37, s12, v8 ; VI-NEXT: s_lshr_b32 s12, s11, 16 -; VI-NEXT: v_or_b32_e32 v17, v53, v10 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 -; VI-NEXT: s_lshr_b32 s13, s10, 16 -; VI-NEXT: v_add_f16_e32 v5, s12, v9 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v16, v41, v10 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; VI-NEXT: v_add_f16_e32 v57, s11, v9 -; VI-NEXT: v_add_f16_e32 v38, s13, v9 -; VI-NEXT: v_add_f16_e32 v11, s10, v9 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_f16_e32 v54, s12, v8 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v45, v37, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; VI-NEXT: v_add_f16_e32 v9, s11, v8 +; VI-NEXT: s_lshr_b32 s11, s10, 16 +; VI-NEXT: v_or_b32_e32 v17, v9, v3 +; VI-NEXT: v_add_f16_e32 v3, s11, v8 +; VI-NEXT: v_add_f16_e32 v38, s10, v8 ; VI-NEXT: s_lshr_b32 s10, s9, 16 -; VI-NEXT: v_or_b32_e32 v33, v57, v10 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v38 -; VI-NEXT: s_lshr_b32 s11, s8, 16 -; VI-NEXT: v_add_f16_e32 v37, s10, v9 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v32, v11, v10 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; VI-NEXT: v_add_f16_e32 v61, s9, v9 -; VI-NEXT: v_add_f16_e32 v48, s11, v9 -; VI-NEXT: v_add_f16_e32 v11, s8, v9 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_f16_e32 v40, s10, v8 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v16, v38, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; VI-NEXT: v_add_f16_e32 v9, s9, v8 +; VI-NEXT: s_lshr_b32 s9, s8, 16 +; VI-NEXT: v_or_b32_e32 v49, v9, v3 +; VI-NEXT: v_add_f16_e32 v3, s9, v8 +; VI-NEXT: v_add_f16_e32 v39, s8, v8 ; VI-NEXT: s_lshr_b32 s8, s7, 16 -; VI-NEXT: v_or_b32_e32 v15, v61, v10 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 -; VI-NEXT: v_add_f16_e32 v49, s8, v9 -; VI-NEXT: v_or_b32_e32 v14, v11, v10 -; VI-NEXT: s_lshr_b32 s9, s6, 16 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v49 -; VI-NEXT: v_add_f16_e32 v56, s7, v9 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v12, v56, v10 -; VI-NEXT: v_add_f16_e32 v10, s9, v9 -; VI-NEXT: v_add_f16_e32 v11, s6, v9 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_f16_e32 v42, s8, v8 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v48, v39, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; VI-NEXT: v_add_f16_e32 v9, s7, v8 +; VI-NEXT: s_lshr_b32 s7, s6, 16 +; VI-NEXT: v_or_b32_e32 v15, v9, v3 +; VI-NEXT: v_add_f16_e32 v3, s7, v8 +; VI-NEXT: v_add_f16_e32 v43, s6, v8 ; VI-NEXT: s_lshr_b32 s6, s5, 16 -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; VI-NEXT: s_lshr_b32 s7, s4, 16 -; VI-NEXT: v_add_f16_e32 v40, s6, v9 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v11, v11, v10 -; VI-NEXT: v_add_f16_e32 v10, s5, v9 -; VI-NEXT: v_add_f16_e32 v0, s7, v9 -; VI-NEXT: v_add_f16_e32 v42, s4, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v40 -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v10, v10, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v9, v42, v9 -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v14 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v17 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[1:2] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[3:4] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v25 -; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[24:25] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[14:15] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[32:33] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v16 -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[16:17] -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v21 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[30:31] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v20 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v23 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[28:29] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v22 -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[6:7], 24, v[20:21] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[22:23] -; VI-NEXT: v_bfe_u32 v0, v60, 8, 8 -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v20, v40, 8, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v42, v40 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[34:35] -; VI-NEXT: v_mov_b32_e32 v4, v5 -; VI-NEXT: v_bfe_u32 v27, v5, 8, 8 -; VI-NEXT: v_mov_b32_e32 v5, v50 -; VI-NEXT: v_bfe_u32 v1, v50, 8, 8 -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v25, v52 -; VI-NEXT: v_bfe_u32 v23, v52, 8, 8 -; VI-NEXT: v_mov_b32_e32 v52, v36 -; VI-NEXT: v_mov_b32_e32 v36, v44 -; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v34 -; VI-NEXT: v_bfe_u32 v34, v36, 8, 8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 -; VI-NEXT: v_mov_b32_e32 v44, v58 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v34, v46, 8, 8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v34, v44, 8, 8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v31 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29 -; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v28 -; VI-NEXT: v_mov_b32_e32 v2, v54 -; VI-NEXT: v_bfe_u32 v21, v49, 8, 8 -; VI-NEXT: v_bfe_u32 v54, v37, 8, 8 -; VI-NEXT: v_bfe_u32 v28, v45, 8, 8 -; VI-NEXT: v_bfe_u32 v29, v43, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_bfe_u32 v34, v0, 8, 8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_bfe_u32 v34, v40, 8, 8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_bfe_u32 v34, v60, 8, 8 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; VI-NEXT: v_add_f16_e32 v44, s6, v8 +; VI-NEXT: v_add_f16_e32 v61, s20, v8 +; VI-NEXT: v_or_b32_e32 v14, v43, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 +; VI-NEXT: v_add_f16_e32 v12, s5, v8 +; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v9, v12, v9 +; VI-NEXT: v_add_f16_e32 v12, s5, v8 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v2, v2, v1 +; VI-NEXT: v_add_f16_e32 v1, s19, v8 +; VI-NEXT: v_add_f16_e32 v55, s4, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v8, v55, v8 +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v9 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[8:9] +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[14:15] +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v15 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v14 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[48:49] +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v49 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v48 +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[16:17] +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[45:46] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[28:29] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v17 +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[18:19] +; VI-NEXT: v_or_b32_e32 v1, v50, v1 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v45 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[30:31] +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v46 +; VI-NEXT: v_mov_b32_e32 v46, v41 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v29 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v28 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v4 +; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[26:27] +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v27 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v26 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[6:7] +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v6 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v25 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[24:25] +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[6:7], 24, v[10:11] +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[20:21] +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v21 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; VI-NEXT: v_bfe_u32 v20, v46, 8, 8 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v20, v59, 8, 8 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v20, v58, 8, 8 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v20, v57, 8, 8 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v20, v56, 8, 8 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v20, v33, 8, 8 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v20, v60, 8, 8 +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v46, v50 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v20, v32, 8, 8 +; VI-NEXT: v_mov_b32_e32 v32, v60 +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[22:23] +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v22 +; VI-NEXT: v_bfe_u32 v29, v44, 8, 8 +; VI-NEXT: v_bfe_u32 v11, v42, 8, 8 +; VI-NEXT: v_bfe_u32 v2, v40, 8, 8 +; VI-NEXT: v_bfe_u32 v22, v54, 8, 8 +; VI-NEXT: v_bfe_u32 v25, v53, 8, 8 +; VI-NEXT: v_bfe_u32 v30, v52, 8, 8 +; VI-NEXT: v_bfe_u32 v23, v51, 8, 8 +; VI-NEXT: v_bfe_u32 v7, v61, 8, 8 ; VI-NEXT: s_branch .LBB95_5 ; VI-NEXT: .LBB95_3: ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; kill: killed $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr71 -; VI-NEXT: ; implicit-def: $sgpr70 ; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr69 -; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr70 ; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr68 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr66 -; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr90 ; VI-NEXT: ; implicit-def: $sgpr64 -; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr83 ; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr82 ; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr81 ; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr50 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr71 ; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr38 ; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr29 ; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr30 -; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 -; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; kill: killed $sgpr28 +; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s26, 0 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s27, 1 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 @@ -193878,9 +193220,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s26, 2 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s27, 3 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 @@ -193890,9 +193233,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s26, 4 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s27, 5 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 @@ -193902,9 +193246,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s26, 6 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s27, 7 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 @@ -193914,9 +193259,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s26, 8 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s27, 9 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 @@ -193926,9 +193272,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s26, 10 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s27, 11 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 @@ -193938,9 +193285,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s26, 12 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s27, 13 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 @@ -193950,294 +193298,288 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 -; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s26, 14 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: v_writelane_b32 v62, s27, 15 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: ; kill: killed $sgpr26 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: s_branch .LBB95_2 ; VI-NEXT: .LBB95_4: +; VI-NEXT: v_mov_b32_e32 v14, s36 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s84 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s63 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s61 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s59 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 17 +; VI-NEXT: v_mov_b32_e32 v61, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 18 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 19 +; VI-NEXT: v_mov_b32_e32 v51, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 20 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 21 +; VI-NEXT: v_mov_b32_e32 v52, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 22 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 23 +; VI-NEXT: v_mov_b32_e32 v53, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 25 +; VI-NEXT: v_mov_b32_e32 v54, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 27 +; VI-NEXT: v_mov_b32_e32 v40, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 29 +; VI-NEXT: v_mov_b32_e32 v42, s28 +; VI-NEXT: v_readlane_b32 s28, v62, 30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s56 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s57 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s46 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s47 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s44 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s45 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s42 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s43 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s40 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s41 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s20 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s65 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s55 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s53 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s51 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s87 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s86 -; VI-NEXT: v_readlane_b32 s6, v62, 0 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 1 -; VI-NEXT: v_mov_b32_e32 v43, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 2 -; VI-NEXT: v_mov_b32_e32 v52, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 3 -; VI-NEXT: v_mov_b32_e32 v45, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 4 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 5 -; VI-NEXT: v_mov_b32_e32 v25, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 6 -; VI-NEXT: v_mov_b32_e32 v8, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 7 -; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 8 -; VI-NEXT: v_mov_b32_e32 v38, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 9 -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 10 -; VI-NEXT: v_mov_b32_e32 v48, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 11 -; VI-NEXT: v_mov_b32_e32 v37, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 12 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 13 -; VI-NEXT: v_mov_b32_e32 v49, s6 -; VI-NEXT: v_readlane_b32 s6, v62, 14 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s84 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s83 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s82 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s81 -; VI-NEXT: v_readlane_b32 s4, v62, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 18 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 19 -; VI-NEXT: v_mov_b32_e32 v29, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 20 -; VI-NEXT: v_mov_b32_e32 v28, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 21 -; VI-NEXT: v_mov_b32_e32 v23, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 23 -; VI-NEXT: v_mov_b32_e32 v27, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 24 -; VI-NEXT: v_mov_b32_e32 v54, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: v_mov_b32_e32 v21, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: v_mov_b32_e32 v20, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 27 -; VI-NEXT: v_mov_b32_e32 v22, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 28 -; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s71 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s69 +; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s66 ; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v7, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v1, s65 +; VI-NEXT: v_mov_b32_e32 v23, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 34 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s64 +; VI-NEXT: v_mov_b32_e32 v30, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s55 +; VI-NEXT: v_mov_b32_e32 v25, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 36 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s54 +; VI-NEXT: v_mov_b32_e32 v22, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 37 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s53 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s52 +; VI-NEXT: v_mov_b32_e32 v11, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 39 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s51 +; VI-NEXT: v_mov_b32_e32 v29, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s50 ; VI-NEXT: v_readlane_b32 s4, v62, 40 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 41 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: v_mov_b32_e32 v31, s4 +; VI-NEXT: v_mov_b32_e32 v19, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 ; VI-NEXT: v_mov_b32_e32 v9, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: v_mov_b32_e32 v30, s4 +; VI-NEXT: v_mov_b32_e32 v31, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: v_mov_b32_e32 v33, s4 +; VI-NEXT: v_mov_b32_e32 v45, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: v_mov_b32_e32 v32, s4 +; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 48 -; VI-NEXT: v_mov_b32_e32 v12, s4 -; VI-NEXT: v_mov_b32_e32 v11, s90 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v11, s88 -; VI-NEXT: v_mov_b32_e32 v59, s14 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v58, s28 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v49, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v48, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 50 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 51 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 52 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 54 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v14, s38 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s34 -; VI-NEXT: v_mov_b32_e32 v11, s78 -; VI-NEXT: v_readlane_b32 s6, v62, 15 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v58, s26 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v51, s19 -; VI-NEXT: v_mov_b32_e32 v55, s16 -; VI-NEXT: v_mov_b32_e32 v13, s15 -; VI-NEXT: v_mov_b32_e32 v41, s12 -; VI-NEXT: v_mov_b32_e32 v53, s13 -; VI-NEXT: v_mov_b32_e32 v57, s11 -; VI-NEXT: v_mov_b32_e32 v61, s9 -; VI-NEXT: v_mov_b32_e32 v56, s7 -; VI-NEXT: v_mov_b32_e32 v7, s71 -; VI-NEXT: v_mov_b32_e32 v60, s70 -; VI-NEXT: v_mov_b32_e32 v50, s69 -; VI-NEXT: v_mov_b32_e32 v40, s68 -; VI-NEXT: v_mov_b32_e32 v35, s67 -; VI-NEXT: v_mov_b32_e32 v0, s66 -; VI-NEXT: v_mov_b32_e32 v44, s64 -; VI-NEXT: v_mov_b32_e32 v46, s54 -; VI-NEXT: v_mov_b32_e32 v47, s52 -; VI-NEXT: v_mov_b32_e32 v36, s50 -; VI-NEXT: v_mov_b32_e32 v42, s6 -; VI-NEXT: v_mov_b32_e32 v34, s85 -; VI-NEXT: v_mov_b32_e32 v26, s80 -; VI-NEXT: v_mov_b32_e32 v24, s48 -; VI-NEXT: v_mov_b32_e32 v19, s38 -; VI-NEXT: v_mov_b32_e32 v6, s36 -; VI-NEXT: v_mov_b32_e32 v3, s30 -; VI-NEXT: v_mov_b32_e32 v18, s76 -; VI-NEXT: v_mov_b32_e32 v17, s74 -; VI-NEXT: v_mov_b32_e32 v16, s72 -; VI-NEXT: v_mov_b32_e32 v15, s62 -; VI-NEXT: v_mov_b32_e32 v14, s60 -; VI-NEXT: v_mov_b32_e32 v11, s58 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v5, s30 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_readlane_b32 s4, v62, 0 +; VI-NEXT: v_mov_b32_e32 v18, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 2 +; VI-NEXT: v_mov_b32_e32 v14, s48 +; VI-NEXT: v_mov_b32_e32 v17, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 4 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v16, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 6 +; VI-NEXT: v_mov_b32_e32 v15, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 8 +; VI-NEXT: v_mov_b32_e32 v14, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 10 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_readlane_b32 s4, v62, 12 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_readlane_b32 s5, v62, 1 +; VI-NEXT: v_readlane_b32 s5, v62, 3 +; VI-NEXT: v_readlane_b32 s5, v62, 5 +; VI-NEXT: v_readlane_b32 s5, v62, 7 +; VI-NEXT: v_readlane_b32 s5, v62, 9 +; VI-NEXT: v_readlane_b32 s5, v62, 11 +; VI-NEXT: v_readlane_b32 s4, v62, 14 +; VI-NEXT: v_readlane_b32 s28, v62, 31 +; VI-NEXT: v_readlane_b32 s5, v62, 13 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v60, s85 +; VI-NEXT: v_mov_b32_e32 v50, s77 +; VI-NEXT: v_mov_b32_e32 v32, s76 +; VI-NEXT: v_mov_b32_e32 v47, s75 +; VI-NEXT: v_mov_b32_e32 v33, s74 +; VI-NEXT: v_mov_b32_e32 v34, s73 +; VI-NEXT: v_mov_b32_e32 v56, s72 +; VI-NEXT: v_mov_b32_e32 v57, s62 +; VI-NEXT: v_mov_b32_e32 v58, s60 +; VI-NEXT: v_mov_b32_e32 v59, s58 +; VI-NEXT: v_mov_b32_e32 v44, s28 +; VI-NEXT: v_mov_b32_e32 v46, s18 +; VI-NEXT: v_mov_b32_e32 v35, s16 +; VI-NEXT: v_mov_b32_e32 v36, s14 +; VI-NEXT: v_mov_b32_e32 v37, s12 +; VI-NEXT: v_mov_b32_e32 v38, s10 +; VI-NEXT: v_mov_b32_e32 v39, s8 +; VI-NEXT: v_mov_b32_e32 v43, s6 +; VI-NEXT: v_mov_b32_e32 v20, s27 +; VI-NEXT: v_mov_b32_e32 v21, s80 +; VI-NEXT: v_mov_b32_e32 v26, s70 +; VI-NEXT: v_mov_b32_e32 v28, s68 +; VI-NEXT: v_mov_b32_e32 v24, s67 +; VI-NEXT: v_mov_b32_e32 v13, s87 +; VI-NEXT: v_mov_b32_e32 v12, s86 +; VI-NEXT: v_mov_b32_e32 v10, s78 +; VI-NEXT: v_mov_b32_e32 v27, s88 +; VI-NEXT: v_mov_b32_e32 v6, s90 +; VI-NEXT: v_readlane_b32 s5, v62, 15 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: .LBB95_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v10 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v58, v2, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; VI-NEXT: v_or_b32_sdwa v29, v43, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: v_or_b32_sdwa v17, v52, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; VI-NEXT: v_or_b32_sdwa v15, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; VI-NEXT: v_or_b32_sdwa v14, v38, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v34 -; VI-NEXT: v_or_b32_sdwa v8, v60, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_or_b32_sdwa v7, v61, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -194271,350 +193613,358 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v58, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v18, v39, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v31 -; VI-NEXT: v_or_b32_sdwa v18, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v18, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v29, vcc, 4, v39 -; VI-NEXT: buffer_store_dword v18, v29, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v9 -; VI-NEXT: v_or_b32_sdwa v18, v55, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 8, v39 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v30 -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v28 -; VI-NEXT: v_or_b32_sdwa v18, v45, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v46, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v39 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v33 -; VI-NEXT: v_or_b32_sdwa v17, v59, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v17, vcc, 16, v39 -; VI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v32 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v23 -; VI-NEXT: v_or_b32_sdwa v16, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v25, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v17, vcc, 20, v39 -; VI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v12 -; VI-NEXT: v_or_b32_sdwa v16, v41, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v16, vcc, 24, v39 -; VI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v11 -; VI-NEXT: v_or_b32_sdwa v13, v48, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v54 -; VI-NEXT: v_or_b32_sdwa v11, v37, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v2 -; VI-NEXT: v_or_b32_sdwa v15, v53, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v15, vcc, 28, v39 -; VI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v14, vcc, 32, v39 -; VI-NEXT: buffer_store_dword v1, v14, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v27 -; VI-NEXT: v_or_b32_sdwa v14, v4, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v14, vcc, 36, v39 -; VI-NEXT: buffer_store_dword v1, v14, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v23 +; VI-NEXT: v_or_b32_sdwa v7, v51, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v16 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v13, vcc, 40, v39 -; VI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v11, vcc, 44, v39 -; VI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; VI-NEXT: v_or_b32_sdwa v7, v52, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v25 +; VI-NEXT: v_or_b32_sdwa v7, v53, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v14 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v22 +; VI-NEXT: v_or_b32_sdwa v7, v54, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v4 -; VI-NEXT: v_or_b32_sdwa v11, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v11, vcc, 48, v39 -; VI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v21 -; VI-NEXT: v_or_b32_sdwa v11, v49, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v3 +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v11, vcc, 52, v39 -; VI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v9, vcc, 56, v39 -; VI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v20 -; VI-NEXT: v_or_b32_sdwa v9, v42, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v9, vcc, 60, v39 -; VI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v24 -; VI-NEXT: v_or_b32_sdwa v9, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v9, vcc, 64, v39 -; VI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v22 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v19 -; VI-NEXT: v_or_b32_sdwa v7, v50, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x44, v39 -; VI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v24 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x48, v39 -; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v2 -; VI-NEXT: v_or_b32_sdwa v7, v40, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x4c, v39 -; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v39 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v39 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v39 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 -; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v39 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v39 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 -; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v39 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v39 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v39 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v39 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 -; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v39 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v39 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v39 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -194631,8 +193981,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -194641,8 +193991,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v63, s30, 0 ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 @@ -194825,26 +194175,30 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_writelane_b32 v62, s26, 11 ; GFX9-NEXT: s_lshr_b32 s26, s18, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 10 -; GFX9-NEXT: s_lshr_b32 s26, s21, 24 -; GFX9-NEXT: v_writelane_b32 v62, s26, 9 -; GFX9-NEXT: s_lshr_b32 s26, s21, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 8 -; GFX9-NEXT: s_lshr_b32 s26, s21, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 7 -; GFX9-NEXT: s_lshr_b32 s26, s20, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 6 -; GFX9-NEXT: s_lshr_b32 s26, s20, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 5 -; GFX9-NEXT: s_lshr_b32 s26, s23, 24 -; GFX9-NEXT: v_writelane_b32 v62, s26, 4 -; GFX9-NEXT: s_lshr_b32 s26, s23, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 3 -; GFX9-NEXT: s_lshr_b32 s26, s23, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 2 -; GFX9-NEXT: s_lshr_b32 s26, s22, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 1 -; GFX9-NEXT: s_lshr_b32 s26, s22, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 0 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[10:11], 24 +; GFX9-NEXT: v_writelane_b32 v62, s78, 8 +; GFX9-NEXT: v_writelane_b32 v62, s79, 9 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 +; GFX9-NEXT: v_writelane_b32 v62, s78, 6 +; GFX9-NEXT: v_writelane_b32 v62, s79, 7 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 +; GFX9-NEXT: v_writelane_b32 v62, s78, 4 +; GFX9-NEXT: v_writelane_b32 v62, s79, 5 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[16:17], 24 +; GFX9-NEXT: v_writelane_b32 v62, s78, 2 +; GFX9-NEXT: v_writelane_b32 v62, s79, 3 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[18:19], 24 +; GFX9-NEXT: v_writelane_b32 v62, s78, 0 +; GFX9-NEXT: s_lshr_b32 s64, s21, 24 +; GFX9-NEXT: s_lshr_b32 s65, s21, 16 +; GFX9-NEXT: s_lshr_b32 s67, s21, 8 +; GFX9-NEXT: s_lshr_b32 s66, s20, 16 +; GFX9-NEXT: s_lshr_b32 s68, s20, 8 +; GFX9-NEXT: s_lshr_b32 s69, s23, 24 +; GFX9-NEXT: s_lshr_b32 s70, s23, 16 +; GFX9-NEXT: s_lshr_b32 s80, s23, 8 +; GFX9-NEXT: s_lshr_b32 s71, s22, 16 +; GFX9-NEXT: s_lshr_b32 s81, s22, 8 ; GFX9-NEXT: s_lshr_b32 s82, s25, 24 ; GFX9-NEXT: s_lshr_b32 s83, s25, 16 ; GFX9-NEXT: s_lshr_b32 s85, s25, 8 @@ -194865,328 +194219,315 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: s_lshr_b32 s54, s45, 8 ; GFX9-NEXT: s_lshr_b32 s53, s44, 16 ; GFX9-NEXT: s_lshr_b32 s55, s44, 8 -; GFX9-NEXT: s_lshr_b32 s64, s47, 24 -; GFX9-NEXT: s_lshr_b32 s65, s47, 16 -; GFX9-NEXT: s_lshr_b32 s67, s47, 8 -; GFX9-NEXT: s_lshr_b32 s66, s46, 16 -; GFX9-NEXT: s_lshr_b32 s68, s46, 8 -; GFX9-NEXT: s_lshr_b32 s69, s57, 24 -; GFX9-NEXT: s_lshr_b32 s70, s57, 16 -; GFX9-NEXT: s_lshr_b32 s80, s57, 8 -; GFX9-NEXT: s_lshr_b32 s71, s56, 16 -; GFX9-NEXT: s_lshr_b32 s81, s56, 8 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 +; GFX9-NEXT: s_lshr_b32 s26, s47, 24 +; GFX9-NEXT: s_lshr_b32 s27, s47, 16 +; GFX9-NEXT: s_lshr_b32 s29, s47, 8 +; GFX9-NEXT: s_lshr_b32 s28, s46, 16 +; GFX9-NEXT: s_lshr_b32 s58, s46, 8 +; GFX9-NEXT: s_lshr_b32 s59, s57, 24 +; GFX9-NEXT: s_lshr_b32 s60, s57, 16 +; GFX9-NEXT: s_lshr_b32 s62, s57, 8 +; GFX9-NEXT: s_lshr_b32 s61, s56, 16 +; GFX9-NEXT: s_lshr_b32 s63, s56, 8 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[8:9], 24 +; GFX9-NEXT: v_writelane_b32 v62, s79, 1 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[56:57], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB95_4 ; GFX9-NEXT: .LBB95_2: ; %cmp.true -; GFX9-NEXT: v_mov_b32_e32 v15, 0x200 -; GFX9-NEXT: v_pk_add_f16 v26, s5, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v25, s4, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v22, s57, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v21, s56, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, s47, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, s46, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, s45, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, s44, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, s43, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, s42, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, s41, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, s40, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, s25, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, s24, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, s23, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, s22, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, s21, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, s20, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v49, s19, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v48, s18, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v38, s17, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v37, s16, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v36, s15, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v35, s14, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v34, s13, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v33, s12, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v32, s11, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v31, s10, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v30, s9, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v29, s8, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v28, s7, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v27, s6, v15 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v17, 0x200 +; GFX9-NEXT: v_pk_add_f16 v16, s57, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, s56, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, s47, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s46, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, s45, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, s44, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, s43, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, s42, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, s41, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, s40, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, s25, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, s24, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, s23, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, s22, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, s21, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, s20, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, s19, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v31, s18, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, s17, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, s16, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, s15, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, s14, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, s13, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, s12, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, s11, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, s10, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, s9, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, s8, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, s7, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, s6, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, s5, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, s4, v17 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[33:34] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[35:36] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[37:38] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[48:49] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v28 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v27 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v30 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v29 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v32 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v11 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v34 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v33 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v36 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v13 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v22 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v22 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v22 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v38 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v36 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v38 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v4 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v26 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v2 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v25 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v28 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v1 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v4 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v28 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v4 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v27 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v30 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v30 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v5 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v29 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v32 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v32 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v32 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v7 +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v13 +; GFX9-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v31 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v12 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v15 ; GFX9-NEXT: s_branch .LBB95_5 ; GFX9-NEXT: .LBB95_3: ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr81 -; GFX9-NEXT: ; implicit-def: $sgpr71 -; GFX9-NEXT: ; implicit-def: $sgpr80 -; GFX9-NEXT: ; implicit-def: $sgpr70 -; GFX9-NEXT: ; implicit-def: $sgpr69 -; GFX9-NEXT: ; implicit-def: $sgpr68 -; GFX9-NEXT: ; implicit-def: $sgpr66 -; GFX9-NEXT: ; implicit-def: $sgpr67 -; GFX9-NEXT: ; implicit-def: $sgpr65 -; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr29 ; GFX9-NEXT: ; implicit-def: $sgpr55 ; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr54 ; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr51 ; GFX9-NEXT: ; implicit-def: $sgpr50 ; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr49 ; GFX9-NEXT: ; implicit-def: $sgpr39 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr99 ; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr94 ; GFX9-NEXT: ; implicit-def: $sgpr98 ; GFX9-NEXT: ; implicit-def: $sgpr96 ; GFX9-NEXT: ; implicit-def: $sgpr87 ; GFX9-NEXT: ; implicit-def: $sgpr86 ; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr76 -; GFX9-NEXT: ; implicit-def: $sgpr74 -; GFX9-NEXT: ; implicit-def: $sgpr72 -; GFX9-NEXT: ; implicit-def: $sgpr62 -; GFX9-NEXT: ; implicit-def: $sgpr60 -; GFX9-NEXT: ; implicit-def: $sgpr58 -; GFX9-NEXT: ; implicit-def: $sgpr28 -; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr71 ; GFX9-NEXT: ; implicit-def: $sgpr34 -; GFX9-NEXT: ; implicit-def: $sgpr30 -; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr92 -; GFX9-NEXT: ; implicit-def: $sgpr90 -; GFX9-NEXT: ; implicit-def: $sgpr88 -; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 ; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s26, 0 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s27, 1 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 @@ -195196,9 +194537,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s26, 2 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s27, 3 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 @@ -195208,9 +194550,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s26, 4 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s27, 5 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 @@ -195220,9 +194563,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s26, 6 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s27, 7 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 @@ -195232,11 +194576,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s26, 8 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s27, 9 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 @@ -195272,254 +194618,233 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: s_branch .LBB95_2 ; GFX9-NEXT: .LBB95_4: -; GFX9-NEXT: v_mov_b32_e32 v15, s71 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s80 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s70 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s69 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s68 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s66 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s67 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s65 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s64 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s55 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s53 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s54 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s52 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s51 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s50 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s48 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s49 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s39 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s38 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s99 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s97 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s98 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s96 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s87 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s86 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s84 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s85 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s83 -; GFX9-NEXT: v_mov_b32_e32 v25, s4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s82 -; GFX9-NEXT: v_readlane_b32 s4, v62, 0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 1 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 2 -; GFX9-NEXT: v_mov_b32_e32 v19, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 3 -; GFX9-NEXT: v_mov_b32_e32 v55, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 5 -; GFX9-NEXT: v_mov_b32_e32 v53, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 6 -; GFX9-NEXT: v_mov_b32_e32 v52, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 7 -; GFX9-NEXT: v_mov_b32_e32 v51, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 8 -; GFX9-NEXT: v_mov_b32_e32 v50, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 9 -; GFX9-NEXT: v_mov_b32_e32 v24, s4 +; GFX9-NEXT: v_mov_b32_e32 v38, s51 +; GFX9-NEXT: v_mov_b32_e32 v37, s76 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, s74 +; GFX9-NEXT: v_mov_b32_e32 v17, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 10 -; GFX9-NEXT: v_mov_b32_e32 v20, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 11 ; GFX9-NEXT: v_mov_b32_e32 v42, s4 +; GFX9-NEXT: v_mov_b32_e32 v41, s94 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s62 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s59 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s58 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s53 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s39 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s38 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s99 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s97 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s98 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s96 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s87 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s86 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s84 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s85 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s83 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s82 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s81 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s71 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s80 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s70 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s69 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s68 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s66 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s67 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s65 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s64 +; GFX9-NEXT: v_readlane_b32 s4, v62, 11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 12 -; GFX9-NEXT: v_mov_b32_e32 v18, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 13 -; GFX9-NEXT: v_mov_b32_e32 v39, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 14 -; GFX9-NEXT: v_mov_b32_e32 v43, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 15 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 16 -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 17 -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, s72 +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 18 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 19 -; GFX9-NEXT: v_mov_b32_e32 v54, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 20 -; GFX9-NEXT: v_mov_b32_e32 v61, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 21 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 22 -; GFX9-NEXT: v_mov_b32_e32 v60, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 23 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 24 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 25 -; GFX9-NEXT: v_mov_b32_e32 v23, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 26 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 27 -; GFX9-NEXT: v_mov_b32_e32 v59, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 28 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_mov_b32_e32 v61, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 29 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 30 -; GFX9-NEXT: v_mov_b32_e32 v58, s4 +; GFX9-NEXT: v_mov_b32_e32 v60, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 31 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_mov_b32_e32 v59, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 32 -; GFX9-NEXT: v_mov_b32_e32 v57, s4 +; GFX9-NEXT: v_mov_b32_e32 v53, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 33 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 34 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_mov_b32_e32 v58, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 35 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_mov_b32_e32 v57, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 36 -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 37 -; GFX9-NEXT: v_mov_b32_e32 v56, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 38 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 39 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 40 -; GFX9-NEXT: v_mov_b32_e32 v47, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 41 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 42 -; GFX9-NEXT: v_mov_b32_e32 v46, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 43 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 44 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 45 -; GFX9-NEXT: v_mov_b32_e32 v45, s4 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 46 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_mov_b32_e32 v56, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 47 -; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 48 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_mov_b32_e32 v46, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s26 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s28 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s58 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s60 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s62 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s72 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s74 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s76 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v47, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 8 +; GFX9-NEXT: v_mov_b32_e32 v41, s92 +; GFX9-NEXT: v_mov_b32_e32 v37, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s34 +; GFX9-NEXT: v_mov_b32_e32 v55, s30 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s78 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s88 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_readlane_b32 s4, v62, 6 ; GFX9-NEXT: v_mov_b32_e32 v41, s90 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s92 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s94 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s30 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s34 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s36 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, s4 +; GFX9-NEXT: v_mov_b32_e32 v41, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, s88 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s56 -; GFX9-NEXT: v_mov_b32_e32 v22, s57 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_readlane_b32 s4, v62, 4 +; GFX9-NEXT: v_mov_b32_e32 v56, s78 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s5 +; GFX9-NEXT: v_readlane_b32 s5, v62, 9 +; GFX9-NEXT: v_readlane_b32 s5, v62, 7 +; GFX9-NEXT: v_readlane_b32 s5, v62, 5 +; GFX9-NEXT: v_readlane_b32 s4, v62, 2 +; GFX9-NEXT: v_readlane_b32 s5, v62, 3 +; GFX9-NEXT: v_mov_b32_e32 v43, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, s56 +; GFX9-NEXT: v_mov_b32_e32 v16, s57 ; GFX9-NEXT: v_mov_b32_e32 v13, s46 ; GFX9-NEXT: v_mov_b32_e32 v14, s47 ; GFX9-NEXT: v_mov_b32_e32 v11, s44 @@ -195532,65 +194857,55 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v6, s25 ; GFX9-NEXT: v_mov_b32_e32 v3, s22 ; GFX9-NEXT: v_mov_b32_e32 v4, s23 -; GFX9-NEXT: v_mov_b32_e32 v1, s20 -; GFX9-NEXT: v_mov_b32_e32 v2, s21 -; GFX9-NEXT: v_mov_b32_e32 v48, s18 -; GFX9-NEXT: v_mov_b32_e32 v49, s19 -; GFX9-NEXT: v_mov_b32_e32 v37, s16 -; GFX9-NEXT: v_mov_b32_e32 v38, s17 -; GFX9-NEXT: v_mov_b32_e32 v35, s14 -; GFX9-NEXT: v_mov_b32_e32 v36, s15 -; GFX9-NEXT: v_mov_b32_e32 v33, s12 -; GFX9-NEXT: v_mov_b32_e32 v34, s13 -; GFX9-NEXT: v_mov_b32_e32 v31, s10 -; GFX9-NEXT: v_mov_b32_e32 v32, s11 -; GFX9-NEXT: v_mov_b32_e32 v29, s8 -; GFX9-NEXT: v_mov_b32_e32 v30, s9 -; GFX9-NEXT: v_mov_b32_e32 v27, s6 -; GFX9-NEXT: v_mov_b32_e32 v28, s7 -; GFX9-NEXT: v_mov_b32_e32 v26, s5 -; GFX9-NEXT: v_mov_b32_e32 v41, v50 -; GFX9-NEXT: v_mov_b32_e32 v50, v51 -; GFX9-NEXT: v_mov_b32_e32 v51, v52 -; GFX9-NEXT: v_mov_b32_e32 v52, v53 -; GFX9-NEXT: v_mov_b32_e32 v53, v55 -; GFX9-NEXT: v_mov_b32_e32 v55, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, s81 -; GFX9-NEXT: .LBB95_5: ; %end -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v15 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_or_b32_sdwa v16, v37, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v35, v35, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v36, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v23, v33, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v57 -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v56 -; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v47 -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v46 -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v44 -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-NEXT: v_mov_b32_e32 v2, s21 +; GFX9-NEXT: v_mov_b32_e32 v31, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s19 +; GFX9-NEXT: v_mov_b32_e32 v29, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s17 +; GFX9-NEXT: v_mov_b32_e32 v27, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s15 +; GFX9-NEXT: v_mov_b32_e32 v25, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s13 +; GFX9-NEXT: v_mov_b32_e32 v23, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s11 +; GFX9-NEXT: v_mov_b32_e32 v21, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s9 +; GFX9-NEXT: v_mov_b32_e32 v19, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s7 +; GFX9-NEXT: v_mov_b32_e32 v51, s26 +; GFX9-NEXT: v_readlane_b32 s5, v62, 1 +; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: v_mov_b32_e32 v36, s48 +; GFX9-NEXT: v_mov_b32_e32 v40, s50 +; GFX9-NEXT: v_mov_b32_e32 v48, s52 +; GFX9-NEXT: v_mov_b32_e32 v49, s54 +; GFX9-NEXT: v_mov_b32_e32 v34, s55 +; GFX9-NEXT: v_mov_b32_e32 v39, s27 +; GFX9-NEXT: v_mov_b32_e32 v54, s29 +; GFX9-NEXT: v_mov_b32_e32 v50, s28 +; GFX9-NEXT: v_mov_b32_e32 v52, s60 +; GFX9-NEXT: v_mov_b32_e32 v55, s61 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, s36 +; GFX9-NEXT: v_mov_b32_e32 v45, s63 +; GFX9-NEXT: v_mov_b32_e32 v56, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v33 +; GFX9-NEXT: v_mov_b32_e32 v33, v53 +; GFX9-NEXT: v_mov_b32_e32 v53, v59 +; GFX9-NEXT: v_mov_b32_e32 v59, v60 +; GFX9-NEXT: v_mov_b32_e32 v60, v61 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, s49 +; GFX9-NEXT: .LBB95_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v44 ; GFX9-NEXT: v_readlane_b32 s99, v63, 35 ; GFX9-NEXT: v_readlane_b32 s98, v63, 34 ; GFX9-NEXT: v_readlane_b32 s97, v63, 33 @@ -195627,329 +194942,343 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v15, v38, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_or_b32_sdwa v22, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v19, v42, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v19, v39, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v40, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v54 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v44, v61, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v32, v44, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v43 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v26, v60, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v25, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v24, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v28, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v18, v41, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v55, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v52, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v50, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v51 +; GFX9-NEXT: v_or_b32_sdwa v14, v39, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v49 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v12, v48, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v37 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v55 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v53, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v52 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload @@ -195967,8 +195296,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -195978,257 +195307,257 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-NEXT: v_writelane_b32 v75, s30, 0 -; GFX11-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-NEXT: v_writelane_b32 v73, s30, 0 +; GFX11-NEXT: v_writelane_b32 v74, s96, 0 ; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 -; GFX11-NEXT: v_writelane_b32 v75, s31, 1 -; GFX11-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-NEXT: v_writelane_b32 v73, s31, 1 +; GFX11-NEXT: v_writelane_b32 v74, s97, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_readfirstlane_b32 s40, v16 ; GFX11-NEXT: v_readfirstlane_b32 s41, v17 ; GFX11-NEXT: v_readfirstlane_b32 s28, v1 -; GFX11-NEXT: v_writelane_b32 v75, s34, 2 -; GFX11-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-NEXT: v_writelane_b32 v73, s34, 2 +; GFX11-NEXT: v_writelane_b32 v74, s98, 2 ; GFX11-NEXT: v_readfirstlane_b32 s29, v2 ; GFX11-NEXT: v_readfirstlane_b32 s14, v3 ; GFX11-NEXT: v_readfirstlane_b32 s15, v4 -; GFX11-NEXT: v_writelane_b32 v75, s35, 3 -; GFX11-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-NEXT: v_writelane_b32 v73, s35, 3 +; GFX11-NEXT: v_writelane_b32 v74, s99, 3 ; GFX11-NEXT: v_readfirstlane_b32 s12, v5 ; GFX11-NEXT: v_readfirstlane_b32 s13, v6 ; GFX11-NEXT: v_readfirstlane_b32 s10, v7 -; GFX11-NEXT: v_writelane_b32 v75, s36, 4 -; GFX11-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-NEXT: v_writelane_b32 v73, s36, 4 +; GFX11-NEXT: v_writelane_b32 v74, s100, 4 ; GFX11-NEXT: v_readfirstlane_b32 s11, v8 ; GFX11-NEXT: v_readfirstlane_b32 s8, v9 ; GFX11-NEXT: v_readfirstlane_b32 s9, v10 -; GFX11-NEXT: v_writelane_b32 v75, s37, 5 -; GFX11-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-NEXT: v_writelane_b32 v73, s37, 5 +; GFX11-NEXT: v_writelane_b32 v74, s101, 5 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 ; GFX11-NEXT: v_readfirstlane_b32 s4, v13 -; GFX11-NEXT: v_writelane_b32 v75, s38, 6 -; GFX11-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-NEXT: v_writelane_b32 v73, s38, 6 +; GFX11-NEXT: v_writelane_b32 v74, s102, 6 ; GFX11-NEXT: v_readfirstlane_b32 s5, v14 ; GFX11-NEXT: s_mov_b32 s99, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: v_writelane_b32 v75, s39, 7 -; GFX11-NEXT: v_writelane_b32 v76, s103, 7 -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 -; GFX11-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane -; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v75, s48, 8 -; GFX11-NEXT: v_writelane_b32 v76, s104, 8 -; GFX11-NEXT: v_writelane_b32 v75, s49, 9 -; GFX11-NEXT: v_writelane_b32 v75, s50, 10 -; GFX11-NEXT: v_writelane_b32 v75, s51, 11 -; GFX11-NEXT: v_writelane_b32 v75, s52, 12 -; GFX11-NEXT: v_writelane_b32 v75, s53, 13 -; GFX11-NEXT: v_writelane_b32 v75, s54, 14 -; GFX11-NEXT: v_writelane_b32 v75, s55, 15 -; GFX11-NEXT: v_writelane_b32 v75, s64, 16 -; GFX11-NEXT: v_writelane_b32 v75, s65, 17 -; GFX11-NEXT: v_writelane_b32 v75, s66, 18 -; GFX11-NEXT: v_writelane_b32 v75, s67, 19 -; GFX11-NEXT: v_writelane_b32 v75, s68, 20 -; GFX11-NEXT: v_writelane_b32 v75, s69, 21 -; GFX11-NEXT: v_writelane_b32 v75, s70, 22 -; GFX11-NEXT: v_writelane_b32 v75, s71, 23 -; GFX11-NEXT: v_writelane_b32 v75, s80, 24 -; GFX11-NEXT: v_writelane_b32 v75, s81, 25 -; GFX11-NEXT: v_writelane_b32 v75, s82, 26 -; GFX11-NEXT: v_writelane_b32 v75, s83, 27 -; GFX11-NEXT: v_writelane_b32 v75, s84, 28 -; GFX11-NEXT: v_writelane_b32 v75, s85, 29 -; GFX11-NEXT: v_writelane_b32 v75, s86, 30 -; GFX11-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-NEXT: v_writelane_b32 v73, s39, 7 +; GFX11-NEXT: v_writelane_b32 v74, s103, 7 +; GFX11-NEXT: s_clause 0x10 ; 68-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 +; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane +; GFX11-NEXT: ; implicit-def: $vgpr75 : SGPR spill to VGPR lane +; GFX11-NEXT: v_writelane_b32 v73, s48, 8 +; GFX11-NEXT: v_writelane_b32 v74, s104, 8 +; GFX11-NEXT: v_writelane_b32 v73, s49, 9 +; GFX11-NEXT: v_writelane_b32 v73, s50, 10 +; GFX11-NEXT: v_writelane_b32 v73, s51, 11 +; GFX11-NEXT: v_writelane_b32 v73, s52, 12 +; GFX11-NEXT: v_writelane_b32 v73, s53, 13 +; GFX11-NEXT: v_writelane_b32 v73, s54, 14 +; GFX11-NEXT: v_writelane_b32 v73, s55, 15 +; GFX11-NEXT: v_writelane_b32 v73, s64, 16 +; GFX11-NEXT: v_writelane_b32 v73, s65, 17 +; GFX11-NEXT: v_writelane_b32 v73, s66, 18 +; GFX11-NEXT: v_writelane_b32 v73, s67, 19 +; GFX11-NEXT: v_writelane_b32 v73, s68, 20 +; GFX11-NEXT: v_writelane_b32 v73, s69, 21 +; GFX11-NEXT: v_writelane_b32 v73, s70, 22 +; GFX11-NEXT: v_writelane_b32 v73, s71, 23 +; GFX11-NEXT: v_writelane_b32 v73, s80, 24 +; GFX11-NEXT: v_writelane_b32 v73, s81, 25 +; GFX11-NEXT: v_writelane_b32 v73, s82, 26 +; GFX11-NEXT: v_writelane_b32 v73, s83, 27 +; GFX11-NEXT: v_writelane_b32 v73, s84, 28 +; GFX11-NEXT: v_writelane_b32 v73, s85, 29 +; GFX11-NEXT: v_writelane_b32 v73, s86, 30 +; GFX11-NEXT: v_writelane_b32 v73, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB95_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-NEXT: s_lshr_b64 s[74:75], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-NEXT: v_writelane_b32 v75, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s27, 8 ; GFX11-NEXT: s_lshr_b32 s43, s27, 24 ; GFX11-NEXT: s_lshr_b32 s34, s5, 24 ; GFX11-NEXT: s_lshr_b32 s35, s5, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-NEXT: v_writelane_b32 v75, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s26, 16 ; GFX11-NEXT: s_lshr_b32 s37, s5, 8 ; GFX11-NEXT: s_lshr_b32 s36, s4, 16 ; GFX11-NEXT: s_lshr_b32 s38, s4, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-NEXT: v_writelane_b32 v75, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s26, 8 ; GFX11-NEXT: s_lshr_b32 s39, s7, 24 ; GFX11-NEXT: s_lshr_b32 s48, s7, 16 ; GFX11-NEXT: s_lshr_b32 s50, s7, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-NEXT: v_writelane_b32 v75, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s25, 24 ; GFX11-NEXT: s_lshr_b32 s49, s6, 16 ; GFX11-NEXT: s_lshr_b32 s51, s6, 8 ; GFX11-NEXT: s_lshr_b32 s52, s9, 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-NEXT: v_writelane_b32 v75, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s25, 16 ; GFX11-NEXT: s_lshr_b32 s53, s9, 16 ; GFX11-NEXT: s_lshr_b32 s55, s9, 8 ; GFX11-NEXT: s_lshr_b32 s54, s8, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-NEXT: v_writelane_b32 v75, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s25, 8 ; GFX11-NEXT: s_lshr_b32 s64, s8, 8 ; GFX11-NEXT: s_lshr_b32 s65, s11, 24 ; GFX11-NEXT: s_lshr_b32 s66, s11, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-NEXT: v_writelane_b32 v75, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s24, 16 ; GFX11-NEXT: s_lshr_b32 s68, s11, 8 ; GFX11-NEXT: s_lshr_b32 s67, s10, 16 ; GFX11-NEXT: s_lshr_b32 s69, s10, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-NEXT: v_writelane_b32 v75, s42, 1 ; GFX11-NEXT: s_lshr_b32 s42, s24, 8 ; GFX11-NEXT: s_lshr_b32 s70, s13, 24 ; GFX11-NEXT: s_lshr_b32 s71, s13, 16 ; GFX11-NEXT: s_lshr_b32 s81, s13, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-NEXT: v_writelane_b32 v75, s42, 0 ; GFX11-NEXT: s_lshr_b32 s42, s23, 24 ; GFX11-NEXT: s_lshr_b32 s80, s12, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-NEXT: v_writelane_b32 v76, s42, 31 ; GFX11-NEXT: s_lshr_b32 s42, s23, 16 ; GFX11-NEXT: s_lshr_b32 s82, s12, 8 ; GFX11-NEXT: s_lshr_b32 s83, s15, 24 ; GFX11-NEXT: s_lshr_b32 s84, s15, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-NEXT: v_writelane_b32 v76, s42, 30 ; GFX11-NEXT: s_lshr_b32 s42, s23, 8 ; GFX11-NEXT: s_lshr_b32 s86, s15, 8 ; GFX11-NEXT: s_lshr_b32 s85, s14, 16 ; GFX11-NEXT: s_lshr_b32 s87, s14, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-NEXT: v_writelane_b32 v76, s42, 29 ; GFX11-NEXT: s_lshr_b32 s42, s22, 16 ; GFX11-NEXT: s_lshr_b32 s96, s29, 24 ; GFX11-NEXT: s_lshr_b32 s97, s29, 16 ; GFX11-NEXT: s_lshr_b32 s100, s29, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-NEXT: v_writelane_b32 v76, s42, 28 ; GFX11-NEXT: s_lshr_b32 s42, s22, 8 ; GFX11-NEXT: s_lshr_b32 s98, s28, 16 ; GFX11-NEXT: s_lshr_b32 s101, s28, 8 ; GFX11-NEXT: s_lshr_b32 s102, s41, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-NEXT: v_writelane_b32 v76, s42, 27 ; GFX11-NEXT: s_lshr_b32 s42, s21, 24 ; GFX11-NEXT: s_lshr_b32 s103, s41, 16 ; GFX11-NEXT: s_lshr_b32 vcc_hi, s41, 8 ; GFX11-NEXT: s_lshr_b32 s104, s40, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-NEXT: v_writelane_b32 v76, s42, 26 ; GFX11-NEXT: s_lshr_b32 s42, s21, 16 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 -; GFX11-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[26:27], 24 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[24:25], 24 ; GFX11-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-NEXT: v_writelane_b32 v76, s42, 25 ; GFX11-NEXT: s_lshr_b32 s42, s21, 8 ; GFX11-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 ; GFX11-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 ; GFX11-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-NEXT: v_writelane_b32 v76, s42, 24 ; GFX11-NEXT: s_lshr_b32 s42, s20, 16 ; GFX11-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[6:7], 24 +; GFX11-NEXT: v_writelane_b32 v76, s42, 23 ; GFX11-NEXT: s_lshr_b32 s42, s20, 8 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[8:9], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[12:13], 24 +; GFX11-NEXT: v_writelane_b32 v76, s42, 22 ; GFX11-NEXT: s_lshr_b32 s42, s19, 24 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[28:29], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[28:29], 24 +; GFX11-NEXT: v_writelane_b32 v76, s42, 21 ; GFX11-NEXT: s_lshr_b32 s42, s19, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-NEXT: v_writelane_b32 v76, s42, 20 ; GFX11-NEXT: s_lshr_b32 s42, s19, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-NEXT: v_writelane_b32 v76, s42, 19 ; GFX11-NEXT: s_lshr_b32 s42, s18, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-NEXT: v_writelane_b32 v76, s42, 18 ; GFX11-NEXT: s_lshr_b32 s42, s18, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-NEXT: v_writelane_b32 v76, s42, 17 ; GFX11-NEXT: s_lshr_b32 s42, s17, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-NEXT: v_writelane_b32 v76, s42, 16 ; GFX11-NEXT: s_lshr_b32 s42, s17, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-NEXT: v_writelane_b32 v76, s42, 15 ; GFX11-NEXT: s_lshr_b32 s42, s17, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-NEXT: v_writelane_b32 v76, s42, 14 ; GFX11-NEXT: s_lshr_b32 s42, s16, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-NEXT: v_writelane_b32 v76, s42, 13 ; GFX11-NEXT: s_lshr_b32 s42, s16, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-NEXT: v_writelane_b32 v76, s42, 12 ; GFX11-NEXT: s_lshr_b32 s42, s3, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-NEXT: v_writelane_b32 v76, s42, 11 ; GFX11-NEXT: s_lshr_b32 s42, s3, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-NEXT: v_writelane_b32 v76, s42, 10 ; GFX11-NEXT: s_lshr_b32 s42, s3, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-NEXT: v_writelane_b32 v76, s42, 9 ; GFX11-NEXT: s_lshr_b32 s42, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s2, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-NEXT: v_writelane_b32 v76, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s1, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-NEXT: v_writelane_b32 v76, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s1, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-NEXT: v_writelane_b32 v76, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-NEXT: v_writelane_b32 v76, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s0, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-NEXT: v_writelane_b32 v76, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s0, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-NEXT: v_writelane_b32 v76, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s40, 8 -; GFX11-NEXT: v_writelane_b32 v78, s74, 0 -; GFX11-NEXT: v_writelane_b32 v78, s75, 1 -; GFX11-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-NEXT: v_writelane_b32 v76, s74, 0 +; GFX11-NEXT: v_writelane_b32 v76, s75, 1 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s99 ; GFX11-NEXT: s_cbranch_vccnz .LBB95_4 ; GFX11-NEXT: .LBB95_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v39, 0x200, s17 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v38, 0x200, s16 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v51, 0x200, s3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v50, 0x200, s2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v33, 0x200, s21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, s24 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v29, 0x200, s23 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v28, 0x200, s22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v33, 0x200, s21 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v32, 0x200, s20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, s5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, s4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v53, 0x200, s3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v37, 0x200, s19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v36, 0x200, s18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 0x200, s41 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, s40 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, s29 op_sel_hi:[0,1] @@ -196243,110 +195572,108 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_pk_add_f16 v5, 0x200, s8 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v4, 0x200, s7 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v3, 0x200, s6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v53, 0x200, s1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v52, 0x200, s0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v37, 0x200, s19 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v25, 0x200, s25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v55, 0x200, s1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v54, 0x200, s0 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v39, 0x200, s17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v38, 0x200, s16 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v21, 0x200, s27 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v20, 0x200, s26 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v24, 0x200, s24 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v36, 0x200, s18 op_sel_hi:[0,1] -; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[38:39] -; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[28:29] -; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[32:33] -; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[50:51] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[24:25] +; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[28:29] +; GFX11-NEXT: v_lshrrev_b64 v[68:69], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[69:70], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] ; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] -; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[20:21] -; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[24:25] -; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[36:37] -; GFX11-NEXT: v_lshrrev_b64 v[82:83], 24, v[52:53] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[20:21] +; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[54:55] ; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] ; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX11-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] ; GFX11-NEXT: v_lshrrev_b64 v[30:31], 24, v[9:10] ; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] -; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[13:14] -; GFX11-NEXT: v_lshrrev_b64 v[68:69], 24, v[15:16] -; GFX11-NEXT: v_lshrrev_b32_e32 v147, 24, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v148, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v149, 8, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v150, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v151, 8, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v161, 24, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v160, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v163, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v164, 8, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v165, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v167, 8, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v177, 8, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v178, 16, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v183, 16, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v41, 8, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v42, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v43, 8, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v44, 16, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v51 -; GFX11-NEXT: v_lshrrev_b32_e32 v57, 16, v51 -; GFX11-NEXT: v_lshrrev_b32_e32 v59, 8, v51 -; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v62, 16, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v72, 8, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v73, 16, v52 -; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v52 +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 24, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 24, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 24, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 16, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 24, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 8, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 24, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 16, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 16, v54 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 8, v54 ; GFX11-NEXT: v_lshrrev_b32_e32 v19, 24, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v97, 8, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 24, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 8, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v102, 8, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v103, 24, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v113, 8, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v115, 8, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v116, 24, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v118, 8, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v128, 8, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v129, 24, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v131, 8, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v133, 8, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v134, 24, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v15 ; GFX11-NEXT: s_branch .LBB95_5 ; GFX11-NEXT: .LBB95_3: ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -196359,57 +195686,57 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr56 ; GFX11-NEXT: ; implicit-def: $sgpr58 ; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: ; implicit-def: $sgpr72 ; GFX11-NEXT: ; implicit-def: $sgpr104 +; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: ; implicit-def: $sgpr103 ; GFX11-NEXT: ; implicit-def: $sgpr102 ; GFX11-NEXT: ; implicit-def: $sgpr101 ; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: ; implicit-def: $sgpr100 ; GFX11-NEXT: ; implicit-def: $sgpr97 ; GFX11-NEXT: ; implicit-def: $sgpr96 ; GFX11-NEXT: ; implicit-def: $sgpr87 ; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr78 ; GFX11-NEXT: ; implicit-def: $sgpr86 ; GFX11-NEXT: ; implicit-def: $sgpr84 ; GFX11-NEXT: ; implicit-def: $sgpr83 ; GFX11-NEXT: ; implicit-def: $sgpr82 ; GFX11-NEXT: ; implicit-def: $sgpr80 +; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr81 ; GFX11-NEXT: ; implicit-def: $sgpr71 ; GFX11-NEXT: ; implicit-def: $sgpr70 ; GFX11-NEXT: ; implicit-def: $sgpr69 ; GFX11-NEXT: ; implicit-def: $sgpr67 +; GFX11-NEXT: ; implicit-def: $sgpr90 ; GFX11-NEXT: ; implicit-def: $sgpr68 ; GFX11-NEXT: ; implicit-def: $sgpr66 ; GFX11-NEXT: ; implicit-def: $sgpr65 ; GFX11-NEXT: ; implicit-def: $sgpr64 ; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr92 ; GFX11-NEXT: ; implicit-def: $sgpr55 ; GFX11-NEXT: ; implicit-def: $sgpr53 ; GFX11-NEXT: ; implicit-def: $sgpr52 ; GFX11-NEXT: ; implicit-def: $sgpr51 ; GFX11-NEXT: ; implicit-def: $sgpr49 +; GFX11-NEXT: ; implicit-def: $sgpr94 ; GFX11-NEXT: ; implicit-def: $sgpr50 ; GFX11-NEXT: ; implicit-def: $sgpr48 ; GFX11-NEXT: ; implicit-def: $sgpr39 ; GFX11-NEXT: ; implicit-def: $sgpr38 ; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr30 ; GFX11-NEXT: ; implicit-def: $sgpr37 ; GFX11-NEXT: ; implicit-def: $sgpr35 ; GFX11-NEXT: ; implicit-def: $sgpr34 -; GFX11-NEXT: ; implicit-def: $sgpr72 -; GFX11-NEXT: ; implicit-def: $sgpr62 -; GFX11-NEXT: ; implicit-def: $sgpr30 -; GFX11-NEXT: ; implicit-def: $sgpr94 -; GFX11-NEXT: ; implicit-def: $sgpr92 -; GFX11-NEXT: ; implicit-def: $sgpr90 -; GFX11-NEXT: ; implicit-def: $sgpr88 -; GFX11-NEXT: ; implicit-def: $sgpr78 -; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-NEXT: v_writelane_b32 v76, s42, 0 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -196420,7 +195747,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: v_writelane_b32 v78, s43, 1 +; GFX11-NEXT: v_writelane_b32 v76, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr43 @@ -196489,295 +195816,295 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: s_branch .LBB95_2 ; GFX11-NEXT: .LBB95_4: -; GFX11-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 -; GFX11-NEXT: v_readlane_b32 s0, v78, 2 -; GFX11-NEXT: v_mov_b32_e32 v71, s50 +; GFX11-NEXT: v_dual_mov_b32 v54, s0 :: v_dual_mov_b32 v55, s1 +; GFX11-NEXT: v_readlane_b32 s0, v76, 2 +; GFX11-NEXT: v_mov_b32_e32 v65, s50 ; GFX11-NEXT: v_dual_mov_b32 v15, s40 :: v_dual_mov_b32 v16, s41 ; GFX11-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v14, s29 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v74, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 3 +; GFX11-NEXT: v_mov_b32_e32 v72, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 3 ; GFX11-NEXT: v_dual_mov_b32 v11, s14 :: v_dual_mov_b32 v12, s15 ; GFX11-NEXT: v_dual_mov_b32 v9, s12 :: v_dual_mov_b32 v10, s13 -; GFX11-NEXT: v_mov_b32_e32 v73, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 4 -; GFX11-NEXT: v_mov_b32_e32 v55, s48 +; GFX11-NEXT: v_mov_b32_e32 v63, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 4 +; GFX11-NEXT: v_mov_b32_e32 v51, s48 ; GFX11-NEXT: v_dual_mov_b32 v7, s10 :: v_dual_mov_b32 v8, s11 ; GFX11-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v6, s9 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v72, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-NEXT: v_mov_b32_e32 v62, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 5 ; GFX11-NEXT: v_mov_b32_e32 v49, s39 ; GFX11-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7 ; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 -; GFX11-NEXT: v_mov_b32_e32 v62, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 6 -; GFX11-NEXT: v_dual_mov_b32 v50, s2 :: v_dual_mov_b32 v51, s3 +; GFX11-NEXT: v_mov_b32_e32 v60, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 6 +; GFX11-NEXT: v_dual_mov_b32 v52, s2 :: v_dual_mov_b32 v53, s3 ; GFX11-NEXT: v_dual_mov_b32 v38, s16 :: v_dual_mov_b32 v39, s17 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v63, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 7 -; GFX11-NEXT: v_dual_mov_b32 v35, s38 :: v_dual_mov_b32 v36, s18 -; GFX11-NEXT: v_dual_mov_b32 v37, s19 :: v_dual_mov_b32 v32, s20 -; GFX11-NEXT: v_dual_mov_b32 v33, s21 :: v_dual_mov_b32 v60, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 8 -; GFX11-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_mov_b32 v29, s23 -; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v61, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 9 -; GFX11-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v21, s27 -; GFX11-NEXT: v_dual_mov_b32 v146, s42 :: v_dual_mov_b32 v145, s104 +; GFX11-NEXT: v_readlane_b32 s0, v76, 7 +; GFX11-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v37, s19 +; GFX11-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21 ; GFX11-NEXT: v_mov_b32_e32 v59, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 10 -; GFX11-NEXT: v_dual_mov_b32 v144, vcc_hi :: v_dual_mov_b32 v135, s103 -; GFX11-NEXT: v_dual_mov_b32 v134, s102 :: v_dual_mov_b32 v133, s101 +; GFX11-NEXT: v_readlane_b32 s0, v76, 8 +; GFX11-NEXT: v_dual_mov_b32 v35, s38 :: v_dual_mov_b32 v28, s22 +; GFX11-NEXT: v_dual_mov_b32 v29, s23 :: v_dual_mov_b32 v24, s24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v25, s25 :: v_dual_mov_b32 v58, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 9 +; GFX11-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v21, s27 +; GFX11-NEXT: v_dual_mov_b32 v144, s42 :: v_dual_mov_b32 v135, s104 ; GFX11-NEXT: v_mov_b32_e32 v57, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 11 -; GFX11-NEXT: v_dual_mov_b32 v31, s36 :: v_dual_mov_b32 v132, s98 -; GFX11-NEXT: v_dual_mov_b32 v131, s100 :: v_dual_mov_b32 v130, s97 -; GFX11-NEXT: v_dual_mov_b32 v129, s96 :: v_dual_mov_b32 v58, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 12 -; GFX11-NEXT: v_dual_mov_b32 v27, s37 :: v_dual_mov_b32 v128, s87 -; GFX11-NEXT: v_dual_mov_b32 v119, s85 :: v_dual_mov_b32 v118, s86 +; GFX11-NEXT: v_readlane_b32 s0, v76, 10 +; GFX11-NEXT: v_dual_mov_b32 v134, vcc_hi :: v_dual_mov_b32 v133, s103 +; GFX11-NEXT: v_dual_mov_b32 v132, s102 :: v_dual_mov_b32 v131, s101 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v117, s84 :: v_dual_mov_b32 v56, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 13 -; GFX11-NEXT: v_dual_mov_b32 v116, s83 :: v_dual_mov_b32 v115, s82 -; GFX11-NEXT: v_dual_mov_b32 v114, s80 :: v_dual_mov_b32 v113, s81 ; GFX11-NEXT: v_mov_b32_e32 v47, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 14 -; GFX11-NEXT: v_dual_mov_b32 v23, s35 :: v_dual_mov_b32 v112, s71 -; GFX11-NEXT: v_dual_mov_b32 v103, s70 :: v_dual_mov_b32 v102, s69 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v101, s67 :: v_dual_mov_b32 v46, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 15 -; GFX11-NEXT: v_dual_mov_b32 v19, s34 :: v_dual_mov_b32 v100, s68 -; GFX11-NEXT: v_dual_mov_b32 v99, s66 :: v_dual_mov_b32 v98, s65 -; GFX11-NEXT: v_dual_mov_b32 v97, s64 :: v_dual_mov_b32 v44, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 16 -; GFX11-NEXT: v_dual_mov_b32 v96, s54 :: v_dual_mov_b32 v87, s55 -; GFX11-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v85, s52 +; GFX11-NEXT: v_readlane_b32 s0, v76, 11 +; GFX11-NEXT: v_dual_mov_b32 v31, s36 :: v_dual_mov_b32 v130, s98 +; GFX11-NEXT: v_dual_mov_b32 v129, s100 :: v_dual_mov_b32 v128, s97 +; GFX11-NEXT: v_dual_mov_b32 v119, s96 :: v_dual_mov_b32 v56, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 12 +; GFX11-NEXT: v_dual_mov_b32 v27, s37 :: v_dual_mov_b32 v118, s87 +; GFX11-NEXT: v_dual_mov_b32 v117, s85 :: v_dual_mov_b32 v116, s86 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v115, s84 :: v_dual_mov_b32 v46, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 13 +; GFX11-NEXT: v_dual_mov_b32 v114, s83 :: v_dual_mov_b32 v113, s82 +; GFX11-NEXT: v_dual_mov_b32 v112, s80 :: v_dual_mov_b32 v103, s81 ; GFX11-NEXT: v_mov_b32_e32 v45, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 17 -; GFX11-NEXT: v_dual_mov_b32 v84, s51 :: v_dual_mov_b32 v83, s49 -; GFX11-NEXT: v_dual_mov_b32 v147, s43 :: v_dual_mov_b32 v22, s78 +; GFX11-NEXT: v_readlane_b32 s0, v76, 14 +; GFX11-NEXT: v_dual_mov_b32 v23, s35 :: v_dual_mov_b32 v102, s71 +; GFX11-NEXT: v_dual_mov_b32 v101, s70 :: v_dual_mov_b32 v100, s69 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v99, s67 :: v_dual_mov_b32 v44, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 15 +; GFX11-NEXT: v_dual_mov_b32 v19, s34 :: v_dual_mov_b32 v98, s68 +; GFX11-NEXT: v_dual_mov_b32 v97, s66 :: v_dual_mov_b32 v96, s65 +; GFX11-NEXT: v_dual_mov_b32 v87, s64 :: v_dual_mov_b32 v42, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 16 +; GFX11-NEXT: v_dual_mov_b32 v86, s54 :: v_dual_mov_b32 v85, s55 +; GFX11-NEXT: v_dual_mov_b32 v84, s53 :: v_dual_mov_b32 v83, s52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v43, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 18 -; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 -; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v42, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 19 -; GFX11-NEXT: v_readlane_b32 s1, v78, 1 -; GFX11-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 -; GFX11-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 +; GFX11-NEXT: v_readlane_b32 s0, v76, 17 +; GFX11-NEXT: v_dual_mov_b32 v82, s51 :: v_dual_mov_b32 v71, s49 +; GFX11-NEXT: v_dual_mov_b32 v145, s43 :: v_dual_mov_b32 v26, s90 ; GFX11-NEXT: v_mov_b32_e32 v41, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 20 -; GFX11-NEXT: v_mov_b32_e32 v48, s62 -; GFX11-NEXT: v_mov_b32_e32 v54, s72 -; GFX11-NEXT: v_mov_b32_e32 v64, s60 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v70, s56 :: v_dual_mov_b32 v183, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 21 -; GFX11-NEXT: v_mov_b32_e32 v80, s46 -; GFX11-NEXT: v_mov_b32_e32 v18, s76 +; GFX11-NEXT: v_readlane_b32 s0, v76, 18 +; GFX11-NEXT: v_dual_mov_b32 v67, s60 :: v_dual_mov_b32 v30, s88 +; GFX11-NEXT: v_dual_mov_b32 v69, s56 :: v_dual_mov_b32 v34, s78 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v40, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 22 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v76, 19 +; GFX11-NEXT: v_readlane_b32 s1, v76, 1 +; GFX11-NEXT: v_dual_mov_b32 v17, s30 :: v_dual_mov_b32 v64, s74 +; GFX11-NEXT: v_dual_mov_b32 v50, s72 :: v_dual_mov_b32 v183, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 20 +; GFX11-NEXT: v_mov_b32_e32 v66, s62 +; GFX11-NEXT: v_mov_b32_e32 v68, s58 +; GFX11-NEXT: v_mov_b32_e32 v70, s46 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_mov_b32 v80, s44 :: v_dual_mov_b32 v181, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 21 +; GFX11-NEXT: v_mov_b32_e32 v18, s94 +; GFX11-NEXT: v_mov_b32_e32 v22, s92 +; GFX11-NEXT: v_mov_b32_e32 v48, s76 ; GFX11-NEXT: v_mov_b32_e32 v182, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 23 -; GFX11-NEXT: v_mov_b32_e32 v181, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-NEXT: v_readlane_b32 s0, v76, 22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v180, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 25 -; GFX11-NEXT: v_mov_b32_e32 v178, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v76, 23 ; GFX11-NEXT: v_mov_b32_e32 v179, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 27 -; GFX11-NEXT: v_mov_b32_e32 v177, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-NEXT: v_readlane_b32 s0, v76, 24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v178, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 25 ; GFX11-NEXT: v_mov_b32_e32 v176, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-NEXT: v_readlane_b32 s0, v76, 26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v177, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 27 ; GFX11-NEXT: v_mov_b32_e32 v167, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-NEXT: v_readlane_b32 s0, v76, 28 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v165, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 31 ; GFX11-NEXT: v_mov_b32_e32 v166, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 29 +; GFX11-NEXT: v_mov_b32_e32 v165, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 30 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v164, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 1 ; GFX11-NEXT: v_mov_b32_e32 v163, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-NEXT: v_readlane_b32 s0, v76, 31 +; GFX11-NEXT: v_mov_b32_e32 v164, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v162, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 3 -; GFX11-NEXT: v_mov_b32_e32 v160, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v75, 1 ; GFX11-NEXT: v_mov_b32_e32 v161, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 5 -; GFX11-NEXT: v_mov_b32_e32 v151, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-NEXT: v_readlane_b32 s0, v75, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v160, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 3 ; GFX11-NEXT: v_mov_b32_e32 v150, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-NEXT: v_readlane_b32 s0, v75, 4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v151, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 5 ; GFX11-NEXT: v_mov_b32_e32 v149, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-NEXT: v_readlane_b32 s0, v75, 6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v148, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 0 -; GFX11-NEXT: v_mov_b32_e32 v82, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 7 +; GFX11-NEXT: v_mov_b32_e32 v147, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v146, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 0 +; GFX11-NEXT: v_mov_b32_e32 v81, s0 ; GFX11-NEXT: .LBB95_5: ; %end -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v52 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v82 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v63 +; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v72, 8, v72 +; GFX11-NEXT: v_and_b32_e32 v63, 0xff, v63 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v81 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v69 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v73 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v50 -; GFX11-NEXT: v_and_b32_e32 v57, 0xff, v57 -; GFX11-NEXT: v_lshlrev_b32_e32 v58, 8, v58 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v52 -; GFX11-NEXT: v_or_b32_e32 v66, v69, v66 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v72 -; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v62, 8, v62 +; GFX11-NEXT: v_or_b32_e32 v54, v54, v72 +; GFX11-NEXT: v_and_b32_e32 v60, 0xff, v60 +; GFX11-NEXT: v_or_b32_e32 v81, v63, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v61, 8, v61 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v62 +; GFX11-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v59, 8, v59 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v81 +; GFX11-NEXT: v_or_b32_e32 v60, v60, v61 +; GFX11-NEXT: v_and_b32_e32 v61, 0xff, v52 +; GFX11-NEXT: v_and_b32_e32 v58, 0xff, v58 ; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v80 -; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v66 -; GFX11-NEXT: v_or_b32_e32 v53, v53, v69 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v62 +; GFX11-NEXT: v_or_b32_e32 v52, v54, v81 +; GFX11-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v60 +; GFX11-NEXT: v_or_b32_e32 v81, v61, v59 +; GFX11-NEXT: v_or_b32_e32 v80, v58, v80 +; GFX11-NEXT: v_and_b32_e32 v58, 0xff, v53 +; GFX11-NEXT: v_lshlrev_b32_e32 v57, 8, v57 +; GFX11-NEXT: v_and_b32_e32 v47, 0xff, v47 +; GFX11-NEXT: v_lshlrev_b32_e32 v56, 8, v56 +; GFX11-NEXT: v_or_b32_e32 v53, v54, v55 +; GFX11-NEXT: v_and_b32_e32 v54, 0xffff, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v80 +; GFX11-NEXT: v_or_b32_e32 v80, v58, v57 +; GFX11-NEXT: v_or_b32_e32 v81, v47, v56 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v46, 8, v46 +; GFX11-NEXT: v_and_b32_e32 v45, 0xff, v45 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v70 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v67 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-NEXT: v_or_b32_e32 v69, v69, v82 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v60 -; GFX11-NEXT: v_and_b32_e32 v60, 0xff, v61 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v64 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-NEXT: v_or_b32_e32 v82, v50, v82 -; GFX11-NEXT: v_or_b32_e32 v81, v60, v81 -; GFX11-NEXT: v_or_b32_e32 v50, v52, v66 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v69 -; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v51 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v59 -; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-NEXT: v_or_b32_e32 v51, v52, v53 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v82 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v81 -; GFX11-NEXT: v_or_b32_e32 v66, v66, v69 -; GFX11-NEXT: v_or_b32_e32 v69, v57, v58 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v56 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v47 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v53 -; GFX11-NEXT: v_and_b32_e32 v53, 0xffff, v66 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v38, v38, v81 -; GFX11-NEXT: v_or_b32_e32 v69, v82, v80 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v46 -; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v44 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v45 -; GFX11-NEXT: v_or_b32_e32 v53, v53, v66 +; GFX11-NEXT: v_or_b32_e32 v54, v54, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xffff, v80 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v81 +; GFX11-NEXT: v_or_b32_e32 v38, v38, v46 +; GFX11-NEXT: v_or_b32_e32 v70, v45, v70 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v44 +; GFX11-NEXT: v_and_b32_e32 v42, 0xff, v42 +; GFX11-NEXT: v_lshlrev_b32_e32 v43, 8, v43 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v80 ; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v39, v39, v80 -; GFX11-NEXT: v_or_b32_e32 v69, v81, v82 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v43 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v42 -; GFX11-NEXT: v_or_b32_e32 v36, v38, v66 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v70 +; GFX11-NEXT: v_or_b32_e32 v39, v39, v81 +; GFX11-NEXT: v_or_b32_e32 v80, v42, v43 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; GFX11-NEXT: v_and_b32_e32 v40, 0xff, v40 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v69 +; GFX11-NEXT: v_or_b32_e32 v36, v38, v70 ; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v66, v80, v81 -; GFX11-NEXT: v_or_b32_e32 v69, v82, v70 -; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v41 -; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v183 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v40 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v80 +; GFX11-NEXT: v_or_b32_e32 v70, v81, v41 +; GFX11-NEXT: v_or_b32_e32 v69, v40, v69 +; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v183 +; GFX11-NEXT: v_and_b32_e32 v181, 0xff, v181 +; GFX11-NEXT: v_lshlrev_b32_e32 v182, 8, v182 ; GFX11-NEXT: v_or_b32_e32 v37, v38, v39 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v66 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v70 ; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v66, v70, v80 -; GFX11-NEXT: v_or_b32_e32 v69, v81, v82 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v181 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v182 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v180 -; GFX11-NEXT: v_and_b32_e32 v66, 0xffff, v66 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v67, v80, v67 -; GFX11-NEXT: v_or_b32_e32 v32, v32, v70 -; GFX11-NEXT: v_or_b32_e32 v33, v33, v81 +; GFX11-NEXT: v_or_b32_e32 v69, v80, v81 +; GFX11-NEXT: v_or_b32_e32 v70, v181, v182 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v180 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v179 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-NEXT: v_and_b32_e32 v69, 0xffff, v69 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v70 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v178, 8, v178 +; GFX11-NEXT: v_or_b32_e32 v32, v32, v80 +; GFX11-NEXT: v_or_b32_e32 v68, v81, v68 ; GFX11-NEXT: v_or_b32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v39, v66, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v67 -; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v178 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v179 +; GFX11-NEXT: v_or_b32_e32 v39, v69, v70 +; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v176 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v177 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v176 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v167 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v165 -; GFX11-NEXT: v_lshlrev_b32_e32 v165, 8, v166 -; GFX11-NEXT: v_or_b32_e32 v67, v67, v69 -; GFX11-NEXT: v_or_b32_e32 v28, v28, v70 -; GFX11-NEXT: v_or_b32_e32 v64, v80, v64 -; GFX11-NEXT: v_or_b32_e32 v29, v29, v81 -; GFX11-NEXT: v_or_b32_e32 v69, v82, v165 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v167 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v166 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v165, 8, v165 +; GFX11-NEXT: v_and_b32_e32 v163, 0xff, v163 +; GFX11-NEXT: v_lshlrev_b32_e32 v164, 8, v164 +; GFX11-NEXT: v_or_b32_e32 v33, v33, v178 +; GFX11-NEXT: v_or_b32_e32 v69, v69, v70 +; GFX11-NEXT: v_or_b32_e32 v28, v28, v80 +; GFX11-NEXT: v_or_b32_e32 v67, v81, v67 +; GFX11-NEXT: v_or_b32_e32 v29, v29, v165 +; GFX11-NEXT: v_or_b32_e32 v70, v163, v164 ; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v68 ; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 ; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67 ; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v70 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[50:53], off +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off ; GFX11-NEXT: scratch_store_b128 v0, v[36:39], off offset:16 -; GFX11-NEXT: v_or_b32_e32 v36, v32, v66 -; GFX11-NEXT: v_or_b32_e32 v37, v33, v67 -; GFX11-NEXT: v_or_b32_e32 v38, v28, v64 -; GFX11-NEXT: v_or_b32_e32 v39, v29, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v164 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v163 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v54 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v162 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v160 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v161 +; GFX11-NEXT: v_or_b32_e32 v36, v32, v68 +; GFX11-NEXT: v_or_b32_e32 v37, v33, v69 +; GFX11-NEXT: v_or_b32_e32 v38, v28, v67 +; GFX11-NEXT: v_or_b32_e32 v39, v29, v70 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v162 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v161 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v160 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v150 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v151 ; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v151 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v149 ; GFX11-NEXT: v_or_b32_e32 v24, v24, v28 ; GFX11-NEXT: v_or_b32_e32 v28, v29, v32 ; GFX11-NEXT: v_or_b32_e32 v25, v25, v33 -; GFX11-NEXT: v_or_b32_e32 v29, v50, v51 -; GFX11-NEXT: v_or_b32_e32 v20, v20, v52 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v150 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v48 +; GFX11-NEXT: v_or_b32_e32 v29, v52, v53 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v54 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v148 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v50 ; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v149 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v148 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v147 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v147 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v146 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v145 ; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v146 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v145 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v144 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v135 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v64 ; GFX11-NEXT: v_or_b32_e32 v32, v32, v33 -; GFX11-NEXT: v_or_b32_e32 v21, v21, v48 -; GFX11-NEXT: v_or_b32_e32 v33, v50, v51 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v52 -; GFX11-NEXT: v_or_b32_e32 v48, v53, v54 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v50 +; GFX11-NEXT: v_or_b32_e32 v33, v52, v53 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v54 +; GFX11-NEXT: v_or_b32_e32 v50, v55, v64 ; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 @@ -196785,39 +196112,39 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-NEXT: v_or_b32_e32 v50, v24, v28 -; GFX11-NEXT: v_or_b32_e32 v52, v20, v32 -; GFX11-NEXT: v_or_b32_e32 v53, v21, v33 -; GFX11-NEXT: v_or_b32_e32 v64, v15, v48 +; GFX11-NEXT: v_or_b32_e32 v52, v24, v28 +; GFX11-NEXT: v_or_b32_e32 v54, v20, v32 +; GFX11-NEXT: v_or_b32_e32 v55, v21, v33 +; GFX11-NEXT: v_or_b32_e32 v66, v15, v50 ; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v144 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v135 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v134 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v134 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v133 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v132 ; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v133 -; GFX11-NEXT: v_or_b32_e32 v51, v25, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v65 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v131 +; GFX11-NEXT: v_or_b32_e32 v53, v25, v29 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v130 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v48 ; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v131 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v129 ; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-NEXT: v_or_b32_e32 v16, v20, v21 ; GFX11-NEXT: v_or_b32_e32 v13, v13, v24 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v130 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v129 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v128 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v119 ; GFX11-NEXT: v_or_b32_e32 v20, v25, v28 ; GFX11-NEXT: v_or_b32_e32 v14, v14, v29 ; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v128 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v119 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v118 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v117 ; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v34 ; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v118 -; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v117 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v116 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v116 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v115 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v114 ; GFX11-NEXT: v_or_b32_e32 v21, v21, v24 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -196833,36 +196160,36 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-NEXT: v_or_b32_e32 v65, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v66, v13, v20 -; GFX11-NEXT: v_or_b32_e32 v67, v14, v21 +; GFX11-NEXT: v_or_b32_e32 v67, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v68, v13, v20 +; GFX11-NEXT: v_or_b32_e32 v69, v14, v21 ; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v115 -; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v113 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v112 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v30 ; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v113 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v112 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v103 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v103 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v102 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v101 ; GFX11-NEXT: v_or_b32_e32 v11, v11, v24 ; GFX11-NEXT: v_or_b32_e32 v12, v12, v25 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v102 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v100 ; GFX11-NEXT: v_or_b32_e32 v9, v9, v13 ; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v16 ; GFX11-NEXT: v_or_b32_e32 v14, v20, v21 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v101 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v99 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v26 ; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v100 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v98 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v97 -; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v87 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v86 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; GFX11-NEXT: v_or_b32_e32 v7, v7, v24 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v99 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v98 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v97 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v96 ; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-NEXT: v_or_b32_e32 v8, v8, v20 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v25 @@ -196883,16 +196210,16 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_or_b32_e32 v7, v7, v15 ; GFX11-NEXT: v_or_b32_e32 v9, v5, v20 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v87 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v86 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v85 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v83 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v84 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v83 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v71 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; GFX11-NEXT: v_or_b32_e32 v8, v8, v16 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v84 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v82 ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v71 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v65 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-NEXT: v_or_b32_e32 v6, v10, v15 ; GFX11-NEXT: v_or_b32_e32 v10, v20, v18 @@ -196901,7 +196228,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v49 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v51 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v35 ; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v31 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17 @@ -196930,78 +196257,76 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_or_b32_e32 v4, v20, v17 ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: scratch_store_b128 v0, v[36:39], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[50:53], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[66:69], off offset:64 ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v74, off, s32 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:72 -; GFX11-NEXT: v_readlane_b32 s104, v76, 8 -; GFX11-NEXT: v_readlane_b32 s103, v76, 7 -; GFX11-NEXT: v_readlane_b32 s102, v76, 6 -; GFX11-NEXT: v_readlane_b32 s101, v76, 5 -; GFX11-NEXT: v_readlane_b32 s100, v76, 4 -; GFX11-NEXT: v_readlane_b32 s99, v76, 3 -; GFX11-NEXT: v_readlane_b32 s98, v76, 2 -; GFX11-NEXT: v_readlane_b32 s97, v76, 1 -; GFX11-NEXT: v_readlane_b32 s96, v76, 0 -; GFX11-NEXT: v_readlane_b32 s87, v75, 31 -; GFX11-NEXT: v_readlane_b32 s86, v75, 30 -; GFX11-NEXT: v_readlane_b32 s85, v75, 29 -; GFX11-NEXT: v_readlane_b32 s84, v75, 28 -; GFX11-NEXT: v_readlane_b32 s83, v75, 27 -; GFX11-NEXT: v_readlane_b32 s82, v75, 26 -; GFX11-NEXT: v_readlane_b32 s81, v75, 25 -; GFX11-NEXT: v_readlane_b32 s80, v75, 24 -; GFX11-NEXT: v_readlane_b32 s71, v75, 23 -; GFX11-NEXT: v_readlane_b32 s70, v75, 22 -; GFX11-NEXT: v_readlane_b32 s69, v75, 21 -; GFX11-NEXT: v_readlane_b32 s68, v75, 20 -; GFX11-NEXT: v_readlane_b32 s67, v75, 19 -; GFX11-NEXT: v_readlane_b32 s66, v75, 18 -; GFX11-NEXT: v_readlane_b32 s65, v75, 17 -; GFX11-NEXT: v_readlane_b32 s64, v75, 16 -; GFX11-NEXT: v_readlane_b32 s55, v75, 15 -; GFX11-NEXT: v_readlane_b32 s54, v75, 14 -; GFX11-NEXT: v_readlane_b32 s53, v75, 13 -; GFX11-NEXT: v_readlane_b32 s52, v75, 12 -; GFX11-NEXT: v_readlane_b32 s51, v75, 11 -; GFX11-NEXT: v_readlane_b32 s50, v75, 10 -; GFX11-NEXT: v_readlane_b32 s49, v75, 9 -; GFX11-NEXT: v_readlane_b32 s48, v75, 8 -; GFX11-NEXT: v_readlane_b32 s39, v75, 7 -; GFX11-NEXT: v_readlane_b32 s38, v75, 6 -; GFX11-NEXT: v_readlane_b32 s37, v75, 5 -; GFX11-NEXT: v_readlane_b32 s36, v75, 4 -; GFX11-NEXT: v_readlane_b32 s35, v75, 3 -; GFX11-NEXT: v_readlane_b32 s34, v75, 2 -; GFX11-NEXT: v_readlane_b32 s31, v75, 1 -; GFX11-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-NEXT: s_clause 0x10 ; 68-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v72, off, s32 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64 +; GFX11-NEXT: v_readlane_b32 s104, v74, 8 +; GFX11-NEXT: v_readlane_b32 s103, v74, 7 +; GFX11-NEXT: v_readlane_b32 s102, v74, 6 +; GFX11-NEXT: v_readlane_b32 s101, v74, 5 +; GFX11-NEXT: v_readlane_b32 s100, v74, 4 +; GFX11-NEXT: v_readlane_b32 s99, v74, 3 +; GFX11-NEXT: v_readlane_b32 s98, v74, 2 +; GFX11-NEXT: v_readlane_b32 s97, v74, 1 +; GFX11-NEXT: v_readlane_b32 s96, v74, 0 +; GFX11-NEXT: v_readlane_b32 s87, v73, 31 +; GFX11-NEXT: v_readlane_b32 s86, v73, 30 +; GFX11-NEXT: v_readlane_b32 s85, v73, 29 +; GFX11-NEXT: v_readlane_b32 s84, v73, 28 +; GFX11-NEXT: v_readlane_b32 s83, v73, 27 +; GFX11-NEXT: v_readlane_b32 s82, v73, 26 +; GFX11-NEXT: v_readlane_b32 s81, v73, 25 +; GFX11-NEXT: v_readlane_b32 s80, v73, 24 +; GFX11-NEXT: v_readlane_b32 s71, v73, 23 +; GFX11-NEXT: v_readlane_b32 s70, v73, 22 +; GFX11-NEXT: v_readlane_b32 s69, v73, 21 +; GFX11-NEXT: v_readlane_b32 s68, v73, 20 +; GFX11-NEXT: v_readlane_b32 s67, v73, 19 +; GFX11-NEXT: v_readlane_b32 s66, v73, 18 +; GFX11-NEXT: v_readlane_b32 s65, v73, 17 +; GFX11-NEXT: v_readlane_b32 s64, v73, 16 +; GFX11-NEXT: v_readlane_b32 s55, v73, 15 +; GFX11-NEXT: v_readlane_b32 s54, v73, 14 +; GFX11-NEXT: v_readlane_b32 s53, v73, 13 +; GFX11-NEXT: v_readlane_b32 s52, v73, 12 +; GFX11-NEXT: v_readlane_b32 s51, v73, 11 +; GFX11-NEXT: v_readlane_b32 s50, v73, 10 +; GFX11-NEXT: v_readlane_b32 s49, v73, 9 +; GFX11-NEXT: v_readlane_b32 s48, v73, 8 +; GFX11-NEXT: v_readlane_b32 s39, v73, 7 +; GFX11-NEXT: v_readlane_b32 s38, v73, 6 +; GFX11-NEXT: v_readlane_b32 s37, v73, 5 +; GFX11-NEXT: v_readlane_b32 s36, v73, 4 +; GFX11-NEXT: v_readlane_b32 s35, v73, 3 +; GFX11-NEXT: v_readlane_b32 s34, v73, 2 +; GFX11-NEXT: v_readlane_b32 s31, v73, 1 +; GFX11-NEXT: v_readlane_b32 s30, v73, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:72 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -197044,18 +196369,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v51, v15 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v5 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v5 +; SI-NEXT: v_mov_b32_e32 v54, v3 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:392 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 @@ -197074,20 +196400,21 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v16 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 @@ -197096,38 +196423,37 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 ; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v20 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -197135,349 +196461,341 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v34 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v27 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:384 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:48 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -197489,15 +196807,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -197505,7 +196823,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -197513,7 +196831,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -197545,870 +196873,870 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v57 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v8, v16, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v41 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v2, v2, v5 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v13, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v5, v2, v5 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v5, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v2, v6, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v18, v6, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v11, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v6, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v8, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v10, v22, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v15, v24, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v18, v36, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v19, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v28, v22, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v30, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v10, v24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v9, v30 +; SI-NEXT: v_mov_b32_e32 v30, v34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v33, v2, v9 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v6, v9 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v23, v7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v17, v9, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v36, v6, v9 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v6, v9 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v21, v9, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v39, v6, v9 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v6, v9 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v50, v9, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v14, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v49, v6, v9 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v52, v6, v9 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v53, v9, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v43, v3, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v9, v6, v9 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v55, v6, v14 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v16, v3, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v43, v14, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v14, v6, v14 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v45, v3, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v44, v6, v15 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v47, v7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v45, v15, v42 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v3, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v6, v15 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v58, v3, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v47, v6, v15 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v59, v7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v58, v15, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_mov_b32_e32 v3, v34 -; SI-NEXT: v_or_b32_e32 v34, v6, v7 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v24, v6, v15 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v6, v7 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v59, v6, v15 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v7, v6 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v35, v6, v7 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v6, v7 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v7, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v37, v37, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v57, v51, v7 +; SI-NEXT: v_or_b32_e32 v63, v15, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v63, v7, v63 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v51, v56, v7 +; SI-NEXT: v_or_b32_e32 v27, v6, v15 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v56, v60, v7 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v37, v56, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v60, v7, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v15, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v9, v9, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v7, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v26, v12, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v12, 0xff, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v62, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v46, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v61, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v40, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v31, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v42, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v22, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v25, v6, v13 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v5 -; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v11 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v5, v11, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v34, v2, v15 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v51, v2, v15 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v6, v11, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v15 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v11, v6, v15, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v26, v11, v18 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v11, v11, v19 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v11, v26, v19, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v28, v11, v30 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v20, v11, v21 -; SI-NEXT: v_alignbit_b32 v11, v28, v21, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v29, v11, v33 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v62 -; SI-NEXT: v_or_b32_e32 v23, v11, v27 -; SI-NEXT: v_alignbit_b32 v11, v29, v27, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v19, v11, v38 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v46 -; SI-NEXT: v_or_b32_e32 v27, v11, v50 -; SI-NEXT: v_alignbit_b32 v11, v19, v50, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v21, v15, v53 -; SI-NEXT: v_alignbit_b32 v15, v11, v53, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v15, v15, v2 -; SI-NEXT: v_or_b32_e32 v46, v3, v43 -; SI-NEXT: v_alignbit_b32 v3, v15, v43, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v17, v3, v16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v39, v3, v45 -; SI-NEXT: v_alignbit_b32 v3, v17, v45, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v61, v3, v24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v3, v3, v58 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v61, v58, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v62, v3, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v40, v3, v36 -; SI-NEXT: v_alignbit_b32 v3, v62, v36, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v59, v3, v35 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v31, v3, v54 -; SI-NEXT: v_alignbit_b32 v3, v59, v54, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v47, v3, v37 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 -; SI-NEXT: v_or_b32_e32 v25, v3, v57 -; SI-NEXT: v_alignbit_b32 v3, v47, v57, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v45, v3, v51 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v22, v3, v56 -; SI-NEXT: v_alignbit_b32 v3, v45, v56, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v44, v3, v4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v54, v15, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v35, v35, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v41, v38, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v62, v15, v62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v38, v57, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v57, v3, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v3, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v15, v15, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v29, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v56, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v61, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v46, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v60, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v32, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v40, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v30, v20, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v42, v20, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v20, v20, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v6, v11 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v5 +; SI-NEXT: v_alignbit_b32 v5, v2, v5, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v15, v5, v18 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v5, v15, v7, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v5, v28 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v7, v5, v10 +; SI-NEXT: v_alignbit_b32 v5, v6, v10, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v26, v5, v33 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v10, v5, v13 +; SI-NEXT: v_alignbit_b32 v5, v26, v13, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v13, v5, v36 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v56, v5, v19 +; SI-NEXT: v_alignbit_b32 v5, v13, v19, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v29, v5, v39 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v8, v5, v23 +; SI-NEXT: v_alignbit_b32 v5, v29, v23, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v19, v5, v49 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v21, v5, v52 +; SI-NEXT: v_alignbit_b32 v5, v19, v52, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v46, v5, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v60, v5, v55 +; SI-NEXT: v_alignbit_b32 v5, v46, v55, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v5, v5, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v32, v17, v44 +; SI-NEXT: v_alignbit_b32 v17, v5, v44, 16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v17, v17, v22 +; SI-NEXT: v_alignbit_b32 v25, v17, v47, 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v31, v59 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v63, v31, v27 +; SI-NEXT: v_and_b32_e32 v50, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v31, v16, v37 +; SI-NEXT: v_alignbit_b32 v16, v63, v37, 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v40, v50, v24 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v40, v59, 16 +; SI-NEXT: v_or_b32_e32 v59, v16, v34 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v37, v16, v51 +; SI-NEXT: v_alignbit_b32 v16, v59, v51, 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v23, v23, v47 +; SI-NEXT: v_or_b32_e32 v47, v16, v35 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v42 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v16, v41 +; SI-NEXT: v_alignbit_b32 v16, v47, v41, 16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v45, v16, v38 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v42, v16, v57 +; SI-NEXT: v_alignbit_b32 v16, v45, v57, 16 +; SI-NEXT: v_or_b32_e32 v44, v4, v3 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v16, v1, v12 +; SI-NEXT: v_alignbit_b32 v1, v44, v12, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: v_mov_b32_e32 v48, v2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: v_mov_b32_e32 v9, v42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: v_or_b32_e32 v12, v1, v9 -; SI-NEXT: v_alignbit_b32 v1, v44, v9, 16 -; SI-NEXT: ; kill: killed $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 -; SI-NEXT: v_mov_b32_e32 v13, v25 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_mov_b32_e32 v14, v31 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v35 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_mov_b32_e32 v14, v25 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v34 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: v_mov_b32_e32 v11, v21 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v8, v56 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: .LBB96_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_mov_b32_e32 v30, v16 -; SI-NEXT: v_mov_b32_e32 v33, v31 -; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: v_mov_b32_e32 v28, v22 +; SI-NEXT: v_mov_b32_e32 v33, v30 +; SI-NEXT: v_mov_b32_e32 v30, v24 ; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v2 +; SI-NEXT: v_or_b32_e32 v5, v5, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x300, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_or_b32_e32 v23, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v23, v2, v5 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_or_b32_e32 v4, v32, v4 +; SI-NEXT: v_or_b32_e32 v4, v61, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_mov_b32_e32 v32, v24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v44, vcc, s7, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v60, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v4, v63, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_or_b32_e32 v5, v57, v5 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v45, vcc, s7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v45 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 @@ -198419,10 +197747,10 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v51, v6 +; SI-NEXT: v_or_b32_e32 v6, v38, v6 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v5 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 @@ -198434,7 +197762,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 +; SI-NEXT: v_or_b32_e32 v7, v35, v7 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v47, vcc, s7, v6 @@ -198442,7 +197770,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -198451,15 +197779,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v7 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -198468,7 +197796,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v59, vcc, s7, v8 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -198476,25 +197804,24 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v10, v56, v10 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v9 -; SI-NEXT: v_mov_b32_e32 v40, v8 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v36, vcc, s7, v9 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v31, v36 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -198503,9 +197830,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v62, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v63, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 @@ -198522,7 +197849,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v11 +; SI-NEXT: v_add_i32_e32 v37, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 @@ -198537,15 +197864,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v61, vcc, s7, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v40, vcc, s7, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v40 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -198554,68 +197882,60 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v13 -; SI-NEXT: v_mov_b32_e32 v13, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_or_b32_e32 v14, v42, v14 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v46, vcc, s7, v15 +; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v27 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -198624,43 +197944,41 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v18, v18, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v60, vcc, s7, v18 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: v_or_b32_e32 v19, v19, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v19 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v46, vcc, s7, v19 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v46 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v20, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v21, v17 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_or_b32_e32 v20, v20, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v20 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 @@ -198675,17 +197993,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v26, v21 ; SI-NEXT: v_or_b32_e32 v22, v21, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_mov_b32_e32 v34, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v22 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v17, v21, v17 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -198694,79 +198012,82 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v26, v21 ; SI-NEXT: v_or_b32_e32 v24, v21, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v17, v21, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v21, v26, v21 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v27, v21, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v27 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v17, v21, v17 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v1, v17 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v26, v21 -; SI-NEXT: v_or_b32_e32 v29, v21, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v26, v21, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v26 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v21, v17 -; SI-NEXT: v_or_b32_e32 v2, v28, v2 -; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v23 -; SI-NEXT: v_mov_b32_e32 v12, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v14 ; SI-NEXT: v_mov_b32_e32 v14, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v24 -; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v13 +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v26 +; SI-NEXT: v_mov_b32_e32 v16, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v24 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v2 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -198774,30 +198095,30 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v33, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v1, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v63 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 @@ -198806,11 +198127,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v32, v3 +; SI-NEXT: v_or_b32_e32 v3, v30, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -198823,63 +198144,66 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v31, v3 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v59 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_or_b32_e32 v2, v34, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v30, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v33, v2 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v3, v1, v3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v2, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -198889,104 +198213,103 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v3, v1, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2 -; SI-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v6, v16, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v26, v10, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v15, v12, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v28, v20, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v6, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v29, v23, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v10, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v19, v27, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v13, v8, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v11, v21, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v29, v21, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v15, v46, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v19, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v46, v60, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v17, v39, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v5, v32, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v17, v23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v61, v9, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v40, v37, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v62, v8, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v63, v36, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v59, v14, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v59, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v47, v13, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v47, v14, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v45, v35, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v45, v9, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v44, v12, 16 -; SI-NEXT: v_alignbit_b32 v3, v1, v7, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v47 +; SI-NEXT: v_alignbit_b32 v1, v44, v16, 16 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_add_i32_e32 v48, vcc, s7, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_alignbit_b32 v2, v48, v4, 16 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; SI-NEXT: .LBB96_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -198996,13 +198319,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -199012,25 +198333,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -199040,11 +198361,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -199054,80 +198375,81 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -199135,41 +198457,45 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -199221,22 +198547,22 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 @@ -199254,148 +198580,148 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v1 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:184 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v9 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v11 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 -; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 -; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 -; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 -; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 -; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v15 -; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v17 -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v19 -; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v14 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v51 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v35 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v53 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v36 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 @@ -199403,506 +198729,410 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:332 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:340 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:348 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:356 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:364 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:372 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v0 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v0 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:44 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:20 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:12 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:60 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v28, v33, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v48, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v54, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v40, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v46, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v14, v37, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: v_or_b32_sdwa v26, v26, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v4, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v5, v5, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v63, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v60, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v56, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v58, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v12, v63, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v34, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v35, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v38, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v25, v25, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v26, v26, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v36, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v50, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; kill: killed $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v30, v30, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v55, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v31, v31, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v42, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; kill: killed $vgpr32 @@ -200007,398 +199237,553 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; kill: killed $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v18, 0x300 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v0, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: v_or_b32_sdwa v29, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v0, 3, v46 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 3, v36 +; VI-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v4, 3, v33 +; VI-NEXT: v_add_u16_e32 v37, 3, v37 +; VI-NEXT: v_add_u16_e32 v35, 3, v35 +; VI-NEXT: v_add_u16_e32 v34, 3, v34 +; VI-NEXT: v_add_u16_e32 v3, 3, v50 +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v3, 3, v48 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v2, 3, v55 +; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v2, 3, v54 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v1, 3, v42 +; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v1, 3, v40 +; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v57, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v4, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v0, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v29, 0x300, v29 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v9, 3, v9 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v9, 3, v9 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v2, 0x300, v3 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v10, 3, v10 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_e32 v10, 3, v10 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v1, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v4 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 -; VI-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u16_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v6, 3, v6 -; VI-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v11, 3, v11 +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v5, 3, v5 -; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_e32 v11, 3, v11 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v4, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v51, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v12, 3, v12 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v49, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v5, 3, v5 -; VI-NEXT: v_or_b32_sdwa v5, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v5, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v6, v5 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v12, 3, v12 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v6, 3, v6 -; VI-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v32, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v32, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v13, 3, v13 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v13, 3, v13 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v28, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 -; VI-NEXT: v_or_b32_e32 v28, v28, v32 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v33, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v33, v33, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v14, 3, v14 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v27, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v27, 0x300, v27 -; VI-NEXT: v_or_b32_e32 v27, v27, v33 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v34, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v34, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v26, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 -; VI-NEXT: v_or_b32_e32 v26, v26, v34 +; VI-NEXT: v_or_b32_sdwa v17, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v35, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v35, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v35 +; VI-NEXT: v_or_b32_sdwa v16, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v36, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 -; VI-NEXT: v_or_b32_e32 v6, v7, v6 -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v36, v36, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v24, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 -; VI-NEXT: v_or_b32_e32 v24, v24, v36 +; VI-NEXT: v_or_b32_sdwa v36, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_e32 v14, 3, v14 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v37, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v37, v37, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 3, v39 +; VI-NEXT: v_or_b32_sdwa v39, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v14, 3, v38 +; VI-NEXT: v_add_u16_e32 v38, 3, v63 +; VI-NEXT: v_mov_b32_e32 v63, 0x300 +; VI-NEXT: v_add_u16_sdwa v32, v32, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v19, v12, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v32 +; VI-NEXT: v_add_u16_sdwa v20, v11, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v21, v10, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v22, v9, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v23, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v27, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v28, v3, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v29, v2, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v30, v1, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v31, v0, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v33, v33, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v36, v36, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v39, v39, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v36 +; VI-NEXT: v_or_b32_e32 v16, v16, v33 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v14, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v39 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v23, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 -; VI-NEXT: v_or_b32_e32 v23, v23, v37 +; VI-NEXT: v_or_b32_sdwa v37, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v35, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v12, v34, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v38, v18, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v34, 3, v34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v8, 3, v8 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 -; VI-NEXT: v_or_b32_e32 v7, v8, v7 -; VI-NEXT: v_add_u16_e32 v8, 3, v63 +; VI-NEXT: v_add_u16_e32 v48, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v9, 3, v62 -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 -; VI-NEXT: v_or_b32_e32 v8, v9, v8 -; VI-NEXT: v_add_u16_e32 v9, 3, v61 +; VI-NEXT: v_or_b32_sdwa v48, v18, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v11, v48, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v10, 3, v60 -; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 -; VI-NEXT: v_or_b32_e32 v9, v10, v9 -; VI-NEXT: v_add_u16_e32 v10, 3, v57 +; VI-NEXT: v_add_u16_e32 v49, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v11, 3, v56 -; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 -; VI-NEXT: v_or_b32_e32 v10, v11, v10 -; VI-NEXT: v_add_u16_e32 v11, 3, v59 +; VI-NEXT: v_or_b32_sdwa v49, v18, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v12, 3, v58 -; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 -; VI-NEXT: v_or_b32_e32 v11, v12, v11 -; VI-NEXT: v_add_u16_e32 v12, 3, v47 +; VI-NEXT: v_add_u16_e32 v50, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v53, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v50, v18, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v10, v50, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v13, 3, v46 -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 -; VI-NEXT: v_or_b32_e32 v12, v13, v12 -; VI-NEXT: v_add_u16_e32 v13, 3, v45 +; VI-NEXT: v_add_u16_e32 v51, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v52, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v24, v7, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v14, 3, v44 -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 -; VI-NEXT: v_or_b32_e32 v13, v14, v13 -; VI-NEXT: v_add_u16_e32 v14, 3, v43 +; VI-NEXT: v_or_b32_sdwa v51, v18, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v15, 3, v42 -; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 -; VI-NEXT: v_or_b32_e32 v14, v15, v14 -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: v_add_u16_e32 v52, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v52, v18, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v9, v52, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v16, 3, v16 -; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_add_u16_e32 v53, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v53, v18, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v17, 3, v17 -; VI-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 +; VI-NEXT: v_add_u16_e32 v54, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v19, 3, v19 -; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v54, v18, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v8, v54, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v21, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v17, v17, v21 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v16, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v19, 0x300, v20 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_e32 v16, v19, v16 -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v55, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v55, v18, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v43, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: v_or_b32_sdwa v30, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v30, 0x300, v30 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v31, v51, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v31, 0x300, v31 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u16_e32 v21, 3, v21 -; VI-NEXT: v_or_b32_sdwa v21, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v40, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v29, v40 +; VI-NEXT: v_add_u16_e32 v40, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v38, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v38, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v40, v18, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v41, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v25, v6, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v7, v40, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v22, 3, v22 -; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v20, 3, v20 -; VI-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v55, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 -; VI-NEXT: v_or_b32_e32 v22, v22, v38 -; VI-NEXT: v_or_b32_e32 v30, v30, v55 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v39, 3, v39 -; VI-NEXT: v_or_b32_sdwa v39, v48, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v21, v39, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v48, 3, v48 -; VI-NEXT: v_or_b32_sdwa v48, v49, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v49, 3, v49 -; VI-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v20, v49, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v50, 3, v50 -; VI-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v19, 3, v19 -; VI-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v41, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v41, v18, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v45, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u16_e32 v39, 3, v39 -; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v39, 0x300, v39 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v51, 3, v51 -; VI-NEXT: v_or_b32_sdwa v51, v52, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v52, 3, v52 -; VI-NEXT: v_or_b32_sdwa v52, v53, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v53, 3, v53 -; VI-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_sdwa v54, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v19, v51, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v18, v53, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v18, v39, v18 -; VI-NEXT: v_add_u16_e32 v39, 0x300, v52 -; VI-NEXT: v_or_b32_e32 v19, v39, v19 -; VI-NEXT: v_add_u16_e32 v39, 0x300, v50 -; VI-NEXT: v_or_b32_e32 v20, v39, v20 -; VI-NEXT: v_add_u16_e32 v39, 0x300, v48 -; VI-NEXT: v_or_b32_e32 v21, v39, v21 -; VI-NEXT: v_or_b32_e32 v31, v31, v54 +; VI-NEXT: v_add_u16_e32 v42, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v42, v18, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v44, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v26, v5, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v6, v42, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v43, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v43, v18, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v44, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v44, v18, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v5, v44, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v45, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v45, v18, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v46, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v46, v18, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v4, v46, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v47, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v47, v18, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v56, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v56, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v3, v56, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v57, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v57, v18, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v58, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v58, v18, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v2, v58, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v59, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v59, v18, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v60, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v60, v18, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v1, v60, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v61, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v61, v18, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v62, 3, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v62, v18, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v18, v13, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v13, v37, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v18, v32, v18 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v0, v62, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v34, v37, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v19, v32, v19 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, v34, v0 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v61 +; VI-NEXT: v_or_b32_e32 v1, v34, v1 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v59 +; VI-NEXT: v_or_b32_e32 v2, v34, v2 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v57 +; VI-NEXT: v_or_b32_e32 v3, v34, v3 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v47 +; VI-NEXT: v_or_b32_e32 v4, v34, v4 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v45 +; VI-NEXT: v_or_b32_e32 v5, v34, v5 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v43 +; VI-NEXT: v_or_b32_e32 v6, v34, v6 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v41 +; VI-NEXT: v_or_b32_e32 v7, v34, v7 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v55 +; VI-NEXT: v_or_b32_e32 v8, v34, v8 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v53 +; VI-NEXT: v_or_b32_e32 v9, v34, v9 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v51 +; VI-NEXT: v_or_b32_e32 v10, v34, v10 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v49 +; VI-NEXT: v_or_b32_e32 v11, v34, v11 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v38 +; VI-NEXT: v_or_b32_e32 v12, v34, v12 +; VI-NEXT: v_add_u16_e32 v34, 0x300, v35 +; VI-NEXT: v_or_b32_e32 v13, v34, v13 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v20, v32, v20 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v21, v32, v21 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v22, v32, v22 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v23, v32, v23 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v24, v32, v24 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v25, v32, v25 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v26, v32, v26 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v28, v32, v28 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v29, v32, v29 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v30, v32, v30 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload @@ -200439,22 +199824,22 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 @@ -200473,192 +199858,186 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v1 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v9 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v11 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v15 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v17 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v25 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v13 -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v15 -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17 -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19 -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v21 -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v25 -; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(39) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(38) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v37 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v48 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v33 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v49 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v52 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v35 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:156 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v53 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 @@ -200666,26 +200045,24 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 @@ -200693,460 +200070,373 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:324 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:340 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v3 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:348 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:356 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:364 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:372 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v1 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:44 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:84 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_or_b32_sdwa v28, v38, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v48, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v54, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v40, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v32, v45, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v62, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v10, v57, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v59, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v47, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v13, v63, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v14, v34, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v43, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v16, v17, v16, s6 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v17, v19, v18, s6 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v18, v35, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v17, v18, v17, s6 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v18, v19, v18, s6 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v36, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v19, v20, v19, s6 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v39, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v20, v21, v20, s6 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v21, v22, v21, s6 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v22, v23, v22, s6 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v49, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v52, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; kill: killed $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v30, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v41, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v30, v31, v30, s6 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; kill: killed $vgpr32 @@ -201249,403 +200539,537 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; kill: killed $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: .LBB96_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB96_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u16_e32 v0, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v30, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v25, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v26, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v27, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v28, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v49 +; GFX9-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-NEXT: v_or_b32_sdwa v35, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v48 +; GFX9-NEXT: v_mov_b32_e32 v2, v36 +; GFX9-NEXT: v_or_b32_sdwa v36, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v3, v37 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v2 -; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v3 -; GFX9-NEXT: v_perm_b32 v0, v2, v0, s6 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v37, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v39, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s6 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 -; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 -; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 -; GFX9-NEXT: v_add_u16_e32 v35, 0x300, v25 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v48, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v38, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v23, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v24, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v18, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v19, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v20, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v21, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v36, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v22 -; GFX9-NEXT: v_add_u16_e32 v36, 0x300, v36 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v22, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 -; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v23, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v37, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v48, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v21 -; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v23 -; GFX9-NEXT: v_perm_b32 v29, v34, v29, s6 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 -; GFX9-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_or_b32_sdwa v49, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v49, 0x300, v49 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v50, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v38, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v51, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v52, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v39, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 -; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v39 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v48, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 -; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 -; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 -; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v43, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v45, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v46, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v47, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v56, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v57, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v58, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v34, 3, v34 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 -; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v60 -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 -; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 -; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 -; GFX9-NEXT: v_add_u16_e32 v10, 3, v57 +; GFX9-NEXT: v_or_b32_sdwa v59, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 3, v63 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v56 -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 -; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 -; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 -; GFX9-NEXT: v_add_u16_e32 v11, 3, v59 +; GFX9-NEXT: v_or_b32_sdwa v60, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v58 -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 -; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 -; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 -; GFX9-NEXT: v_add_u16_e32 v12, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v61, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v46 -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 -; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 -; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 -; GFX9-NEXT: v_add_u16_e32 v13, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v62, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 -; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 -; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: v_add_u16_e32 v14, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v63, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v15, 3, v42 -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 -; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 -; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 -; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 +; GFX9-NEXT: v_or_b32_sdwa v11, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v16 -; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v18 -; GFX9-NEXT: v_perm_b32 v17, v17, v20, s6 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v19 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v16, v18, v16, s6 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v49, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v20 -; GFX9-NEXT: v_perm_b32 v30, v33, v30, s6 +; GFX9-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v50, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v52, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v39, 0x300, v50 -; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v18 +; GFX9-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v51, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v51 +; GFX9-NEXT: v_or_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v52, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v32, 0x300, v19 -; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 +; GFX9-NEXT: v_or_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v53, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v53 +; GFX9-NEXT: v_or_b32_sdwa v7, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v54, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v6, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_perm_b32 v6, v6, v7, s6 +; GFX9-NEXT: v_perm_b32 v7, v8, v9, s6 +; GFX9-NEXT: v_perm_b32 v8, v14, v15, s6 +; GFX9-NEXT: v_perm_b32 v9, v10, v11, s6 +; GFX9-NEXT: v_perm_b32 v10, v12, v13, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v55, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v55 +; GFX9-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v40, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v50, 0x300, v40 -; GFX9-NEXT: v_perm_b32 v21, v50, v21, s6 +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v41, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v41 +; GFX9-NEXT: v_or_b32_sdwa v42, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v42, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v51, 0x300, v42 -; GFX9-NEXT: v_perm_b32 v20, v51, v20, s6 +; GFX9-NEXT: v_or_b32_sdwa v44, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v43, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v43 +; GFX9-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v44, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 +; GFX9-NEXT: v_perm_b32 v3, v31, v3, s6 +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v27 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v23 +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v50 +; GFX9-NEXT: v_add_u16_e32 v50, 0x300, v51 +; GFX9-NEXT: v_add_u16_e32 v51, 0x300, v53 +; GFX9-NEXT: v_add_u16_e32 v53, 0x300, v41 +; GFX9-NEXT: v_add_u16_e32 v41, 0x300, v59 +; GFX9-NEXT: v_add_u16_e32 v59, 0x300, v60 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v45, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v45 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v32, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v32, 0x300, v32 +; GFX9-NEXT: v_perm_b32 v2, v32, v2, s6 +; GFX9-NEXT: v_add_u16_e32 v32, 0x300, v25 +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v20 +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v40 +; GFX9-NEXT: v_add_u16_e32 v40, 0x300, v57 +; GFX9-NEXT: v_add_u16_e32 v57, 0x300, v17 +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v44 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v33, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v33 +; GFX9-NEXT: v_perm_b32 v1, v33, v1, s6 +; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v26 +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v18 +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v46 +; GFX9-NEXT: v_add_u16_e32 v46, 0x300, v16 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v42 +; GFX9-NEXT: v_perm_b32 v4, v17, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v16, v5, s6 +; GFX9-NEXT: v_perm_b32 v11, v46, v57, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v46, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v24 -; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v26 -; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v37 +; GFX9-NEXT: v_or_b32_sdwa v34, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v34 +; GFX9-NEXT: v_perm_b32 v0, v34, v0, s6 +; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v28 +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v48 +; GFX9-NEXT: v_add_u16_e32 v48, 0x300, v21 +; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v54 +; GFX9-NEXT: v_add_u16_e32 v54, 0x300, v45 +; GFX9-NEXT: v_add_u16_e32 v45, 0x300, v63 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v30 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v35 +; GFX9-NEXT: v_add_u16_e32 v35, 0x300, v36 +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v37 +; GFX9-NEXT: v_add_u16_e32 v36, 0x300, v39 ; GFX9-NEXT: v_add_u16_e32 v37, 0x300, v38 -; GFX9-NEXT: v_add_u16_e32 v38, 0x300, v48 -; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v49 -; GFX9-NEXT: v_add_u16_e32 v48, 0x300, v52 -; GFX9-NEXT: v_add_u16_e32 v49, 0x300, v54 -; GFX9-NEXT: v_add_u16_e32 v52, 0x300, v44 -; GFX9-NEXT: v_add_u16_e32 v53, 0x300, v46 -; GFX9-NEXT: v_perm_b32 v18, v53, v18, s6 -; GFX9-NEXT: v_perm_b32 v19, v52, v19, s6 -; GFX9-NEXT: v_perm_b32 v22, v49, v22, s6 -; GFX9-NEXT: v_perm_b32 v23, v48, v23, s6 -; GFX9-NEXT: v_perm_b32 v24, v39, v24, s6 -; GFX9-NEXT: v_perm_b32 v25, v38, v25, s6 -; GFX9-NEXT: v_perm_b32 v26, v37, v26, s6 -; GFX9-NEXT: v_perm_b32 v27, v36, v27, s6 -; GFX9-NEXT: v_perm_b32 v28, v35, v28, s6 +; GFX9-NEXT: v_add_u16_e32 v38, 0x300, v24 +; GFX9-NEXT: v_add_u16_e32 v39, 0x300, v19 +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v22 +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v52 +; GFX9-NEXT: v_add_u16_e32 v52, 0x300, v55 +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v43 +; GFX9-NEXT: v_add_u16_e32 v55, 0x300, v47 +; GFX9-NEXT: v_perm_b32 v16, v55, v18, s6 +; GFX9-NEXT: v_perm_b32 v17, v54, v19, s6 +; GFX9-NEXT: v_perm_b32 v18, v53, v20, s6 +; GFX9-NEXT: v_perm_b32 v19, v52, v21, s6 +; GFX9-NEXT: v_perm_b32 v20, v51, v22, s6 +; GFX9-NEXT: v_perm_b32 v21, v50, v23, s6 +; GFX9-NEXT: v_perm_b32 v22, v49, v24, s6 +; GFX9-NEXT: v_perm_b32 v23, v48, v25, s6 +; GFX9-NEXT: v_perm_b32 v24, v39, v26, s6 +; GFX9-NEXT: v_perm_b32 v25, v38, v27, s6 +; GFX9-NEXT: v_perm_b32 v26, v37, v28, s6 +; GFX9-NEXT: v_perm_b32 v27, v36, v29, s6 +; GFX9-NEXT: v_perm_b32 v28, v35, v30, s6 +; GFX9-NEXT: v_perm_b32 v29, v34, v31, s6 +; GFX9-NEXT: v_perm_b32 v30, v33, v32, s6 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v47, 0x300, v56 +; GFX9-NEXT: v_add_u16_e32 v56, 0x300, v58 +; GFX9-NEXT: v_add_u16_e32 v43, 0x300, v61 +; GFX9-NEXT: v_add_u16_e32 v58, 0x300, v62 +; GFX9-NEXT: v_perm_b32 v12, v45, v58, s6 +; GFX9-NEXT: v_perm_b32 v13, v43, v59, s6 +; GFX9-NEXT: v_perm_b32 v14, v41, v56, s6 +; GFX9-NEXT: v_perm_b32 v15, v40, v47, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 ; GFX9-NEXT: .LBB96_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload @@ -203327,6 +202751,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 @@ -203336,24 +202761,24 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 -; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v43, s29, 0 -; SI-NEXT: v_writelane_b32 v43, s28, 1 -; SI-NEXT: v_writelane_b32 v43, s27, 2 -; SI-NEXT: v_writelane_b32 v43, s26, 3 -; SI-NEXT: v_writelane_b32 v43, s25, 4 -; SI-NEXT: v_writelane_b32 v43, s24, 5 -; SI-NEXT: v_writelane_b32 v43, s23, 6 -; SI-NEXT: v_writelane_b32 v43, s22, 7 -; SI-NEXT: v_writelane_b32 v43, s21, 8 -; SI-NEXT: v_writelane_b32 v43, s20, 9 -; SI-NEXT: v_writelane_b32 v43, s19, 10 -; SI-NEXT: v_writelane_b32 v43, s18, 11 -; SI-NEXT: v_writelane_b32 v43, s17, 12 -; SI-NEXT: v_writelane_b32 v43, s16, 13 +; SI-NEXT: v_writelane_b32 v44, s29, 0 +; SI-NEXT: v_writelane_b32 v44, s28, 1 +; SI-NEXT: v_writelane_b32 v44, s27, 2 +; SI-NEXT: v_writelane_b32 v44, s26, 3 +; SI-NEXT: v_writelane_b32 v44, s25, 4 +; SI-NEXT: v_writelane_b32 v44, s24, 5 +; SI-NEXT: v_writelane_b32 v44, s23, 6 +; SI-NEXT: v_writelane_b32 v44, s22, 7 +; SI-NEXT: v_writelane_b32 v44, s21, 8 +; SI-NEXT: v_writelane_b32 v44, s20, 9 +; SI-NEXT: v_writelane_b32 v44, s19, 10 +; SI-NEXT: v_writelane_b32 v44, s18, 11 +; SI-NEXT: v_writelane_b32 v44, s17, 12 +; SI-NEXT: v_writelane_b32 v44, s16, 13 ; SI-NEXT: v_writelane_b32 v41, s31, 1 ; SI-NEXT: v_writelane_b32 v41, s34, 2 ; SI-NEXT: v_writelane_b32 v41, s35, 3 @@ -203388,97 +202813,95 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s96, 32 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: v_writelane_b32 v41, s99, 35 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: v_readfirstlane_b32 s39, v26 -; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s47, v12 -; SI-NEXT: v_writelane_b32 v42, s39, 0 -; SI-NEXT: v_readfirstlane_b32 s56, v11 -; SI-NEXT: v_writelane_b32 v42, s47, 1 -; SI-NEXT: v_readfirstlane_b32 s48, v24 -; SI-NEXT: v_writelane_b32 v42, s56, 2 -; SI-NEXT: v_readfirstlane_b32 s49, v23 -; SI-NEXT: v_writelane_b32 v42, s48, 3 -; SI-NEXT: v_readfirstlane_b32 s50, v21 -; SI-NEXT: v_writelane_b32 v42, s49, 4 -; SI-NEXT: v_readfirstlane_b32 s51, v22 -; SI-NEXT: v_writelane_b32 v42, s50, 5 -; SI-NEXT: v_writelane_b32 v42, s51, 6 -; SI-NEXT: v_readfirstlane_b32 s57, v20 -; SI-NEXT: v_readfirstlane_b32 s58, v19 -; SI-NEXT: v_readfirstlane_b32 s64, v29 -; SI-NEXT: v_readfirstlane_b32 s65, v30 -; SI-NEXT: v_readfirstlane_b32 s59, v28 -; SI-NEXT: v_readfirstlane_b32 s60, v27 -; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s93, v16 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s31, v15 +; SI-NEXT: v_writelane_b32 v43, s93, 0 +; SI-NEXT: v_readfirstlane_b32 s35, v25 +; SI-NEXT: v_writelane_b32 v43, s31, 1 +; SI-NEXT: v_readfirstlane_b32 s37, v26 +; SI-NEXT: v_writelane_b32 v43, s35, 2 +; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: v_writelane_b32 v43, s37, 3 +; SI-NEXT: v_readfirstlane_b32 s11, v11 +; SI-NEXT: v_writelane_b32 v43, s9, 4 +; SI-NEXT: v_readfirstlane_b32 s51, v24 +; SI-NEXT: v_writelane_b32 v43, s11, 5 +; SI-NEXT: v_readfirstlane_b32 s12, v21 +; SI-NEXT: v_writelane_b32 v43, s51, 6 +; SI-NEXT: v_readfirstlane_b32 s14, v22 +; SI-NEXT: v_writelane_b32 v43, s12, 7 +; SI-NEXT: v_writelane_b32 v43, s14, 8 +; SI-NEXT: v_readfirstlane_b32 s40, v20 +; SI-NEXT: v_readfirstlane_b32 s42, v19 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 +; SI-NEXT: v_writelane_b32 v44, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v43, s4, 15 -; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_readfirstlane_b32 s6, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v43, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: v_writelane_b32 v44, s4, 15 ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: v_writelane_b32 v44, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s44, v36 -; SI-NEXT: v_readfirstlane_b32 s90, v37 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v44, s4, 17 +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v44, s4, 18 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s16, v38 +; SI-NEXT: v_readfirstlane_b32 s70, v34 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 +; SI-NEXT: v_readfirstlane_b32 s76, v29 +; SI-NEXT: v_readfirstlane_b32 s78, v30 +; SI-NEXT: v_readfirstlane_b32 s50, v27 +; SI-NEXT: v_readfirstlane_b32 s28, v1 +; SI-NEXT: v_readfirstlane_b32 s43, v2 +; SI-NEXT: v_readfirstlane_b32 s75, v9 +; SI-NEXT: v_readfirstlane_b32 s77, v10 +; SI-NEXT: v_readfirstlane_b32 s79, v8 +; SI-NEXT: v_readfirstlane_b32 s88, v7 +; SI-NEXT: v_readfirstlane_b32 s89, v17 +; SI-NEXT: v_readfirstlane_b32 s80, v23 +; SI-NEXT: v_readfirstlane_b32 s91, v18 +; SI-NEXT: v_readfirstlane_b32 s98, v14 +; SI-NEXT: v_readfirstlane_b32 s96, v13 +; SI-NEXT: v_readfirstlane_b32 s82, v6 +; SI-NEXT: v_readfirstlane_b32 s65, v5 +; SI-NEXT: v_readfirstlane_b32 s84, v4 +; SI-NEXT: v_readfirstlane_b32 s86, v3 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s6, v38 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s18, v7 -; SI-NEXT: v_readfirstlane_b32 s21, v5 -; SI-NEXT: v_readfirstlane_b32 s22, v6 -; SI-NEXT: v_readfirstlane_b32 s40, v17 -; SI-NEXT: v_readfirstlane_b32 s41, v18 -; SI-NEXT: v_readfirstlane_b32 s42, v4 -; SI-NEXT: v_readfirstlane_b32 s43, v3 -; SI-NEXT: v_readfirstlane_b32 s76, v16 -; SI-NEXT: v_readfirstlane_b32 s77, v15 -; SI-NEXT: v_readfirstlane_b32 s38, v25 -; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s93, v55 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s95, v40 +; SI-NEXT: v_readfirstlane_b32 s81, v53 +; SI-NEXT: v_readfirstlane_b32 s20, v54 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 19 +; SI-NEXT: v_writelane_b32 v44, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 20 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 21 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 22 +; SI-NEXT: v_writelane_b32 v44, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 23 +; SI-NEXT: v_readfirstlane_b32 s44, v48 +; SI-NEXT: v_readfirstlane_b32 s57, v49 +; SI-NEXT: v_writelane_b32 v44, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 @@ -203489,43 +202912,41 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s91, v32 +; SI-NEXT: v_readfirstlane_b32 s22, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s8, v33 +; SI-NEXT: v_readfirstlane_b32 s59, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v43, s4, 24 +; SI-NEXT: v_writelane_b32 v44, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 25 +; SI-NEXT: v_readfirstlane_b32 s61, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: v_readfirstlane_b32 s73, v35 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: v_readfirstlane_b32 s55, v36 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_readfirstlane_b32 s25, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v43, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: v_writelane_b32 v44, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s89, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v44, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s78, v39 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s7, v48 +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v44, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s82, v49 +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_readfirstlane_b32 s47, v48 +; SI-NEXT: v_writelane_b32 v44, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s96, v51 +; SI-NEXT: v_readfirstlane_b32 s62, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 @@ -203533,39 +202954,36 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: v_writelane_b32 v44, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s70, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 -; SI-NEXT: v_writelane_b32 v43, s4, 30 -; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: v_writelane_b32 v44, s4, 28 +; SI-NEXT: v_readfirstlane_b32 s15, v32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: v_readfirstlane_b32 s8, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s9, v35 +; SI-NEXT: v_readfirstlane_b32 s10, v35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s63, v36 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v43, s4, 33 -; SI-NEXT: v_readfirstlane_b32 s10, v36 +; SI-NEXT: v_readfirstlane_b32 s30, v37 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 34 +; SI-NEXT: v_readfirstlane_b32 s83, v31 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 35 +; SI-NEXT: v_readfirstlane_b32 s72, v38 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 36 +; SI-NEXT: v_writelane_b32 v44, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s69, v48 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s30, v49 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s16, v50 +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_writelane_b32 v44, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s36, v51 +; SI-NEXT: v_readfirstlane_b32 s4, v51 +; SI-NEXT: v_readfirstlane_b32 s85, v49 +; SI-NEXT: v_readfirstlane_b32 s97, v50 +; SI-NEXT: v_writelane_b32 v44, s4, 31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(3) @@ -203581,50 +202999,58 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v43, s4, 37 +; SI-NEXT: v_writelane_b32 v44, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v43, s4, 38 -; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v43, s4, 39 -; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v43, s4, 40 -; SI-NEXT: v_writelane_b32 v43, s44, 41 -; SI-NEXT: v_writelane_b32 v43, s6, 42 -; SI-NEXT: v_writelane_b32 v43, s7, 43 -; SI-NEXT: v_writelane_b32 v43, s8, 44 -; SI-NEXT: v_writelane_b32 v43, s9, 45 -; SI-NEXT: v_writelane_b32 v43, s10, 46 -; SI-NEXT: v_writelane_b32 v43, s11, 47 -; SI-NEXT: v_writelane_b32 v43, s12, 48 -; SI-NEXT: v_writelane_b32 v43, s13, 49 -; SI-NEXT: v_writelane_b32 v43, s14, 50 -; SI-NEXT: v_writelane_b32 v43, s15, 51 -; SI-NEXT: v_writelane_b32 v43, s18, 52 -; SI-NEXT: v_writelane_b32 v43, s21, 53 -; SI-NEXT: v_writelane_b32 v43, s22, 54 -; SI-NEXT: v_writelane_b32 v43, s40, 55 -; SI-NEXT: v_writelane_b32 v43, s41, 56 -; SI-NEXT: v_writelane_b32 v43, s42, 57 -; SI-NEXT: v_writelane_b32 v43, s43, 58 -; SI-NEXT: v_writelane_b32 v43, s76, 59 -; SI-NEXT: v_writelane_b32 v43, s77, 60 +; SI-NEXT: v_writelane_b32 v44, s4, 33 +; SI-NEXT: v_readfirstlane_b32 s4, v55 +; SI-NEXT: v_writelane_b32 v44, s4, 34 +; SI-NEXT: v_readfirstlane_b32 s4, v40 +; SI-NEXT: v_writelane_b32 v44, s4, 35 +; SI-NEXT: v_writelane_b32 v44, s44, 36 +; SI-NEXT: v_writelane_b32 v44, s62, 37 +; SI-NEXT: v_writelane_b32 v44, s73, 38 +; SI-NEXT: v_writelane_b32 v44, s61, 39 +; SI-NEXT: v_writelane_b32 v44, s57, 40 +; SI-NEXT: v_writelane_b32 v44, s30, 41 +; SI-NEXT: v_writelane_b32 v44, s55, 42 +; SI-NEXT: v_writelane_b32 v44, s6, 43 +; SI-NEXT: v_writelane_b32 v44, s25, 44 +; SI-NEXT: v_writelane_b32 v44, s83, 45 +; SI-NEXT: v_writelane_b32 v44, s81, 46 +; SI-NEXT: v_writelane_b32 v44, s20, 47 +; SI-NEXT: v_writelane_b32 v44, s72, 48 +; SI-NEXT: v_writelane_b32 v44, s22, 49 +; SI-NEXT: v_writelane_b32 v44, s16, 50 +; SI-NEXT: v_writelane_b32 v44, s28, 51 +; SI-NEXT: v_writelane_b32 v44, s43, 52 +; SI-NEXT: v_writelane_b32 v44, s8, 53 +; SI-NEXT: v_writelane_b32 v44, s47, 54 +; SI-NEXT: v_writelane_b32 v44, s59, 55 +; SI-NEXT: v_writelane_b32 v44, s10, 56 +; SI-NEXT: v_writelane_b32 v44, s63, 57 +; SI-NEXT: v_writelane_b32 v44, s75, 58 +; SI-NEXT: v_writelane_b32 v44, s77, 59 +; SI-NEXT: v_writelane_b32 v44, s79, 60 +; SI-NEXT: v_writelane_b32 v44, s88, 61 +; SI-NEXT: v_writelane_b32 v44, s89, 62 +; SI-NEXT: v_writelane_b32 v44, s91, 63 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s17, v33 +; SI-NEXT: v_readfirstlane_b32 s92, v33 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s98, v34 +; SI-NEXT: v_readfirstlane_b32 s13, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s23, v35 -; SI-NEXT: v_readfirstlane_b32 s25, v31 -; SI-NEXT: v_readfirstlane_b32 s28, v32 +; SI-NEXT: v_readfirstlane_b32 s26, v35 +; SI-NEXT: v_readfirstlane_b32 s71, v31 +; SI-NEXT: v_readfirstlane_b32 s69, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s26, v36 +; SI-NEXT: v_readfirstlane_b32 s94, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s88, v37 +; SI-NEXT: v_readfirstlane_b32 s90, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s79, v38 +; SI-NEXT: v_readfirstlane_b32 s95, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s75, v39 +; SI-NEXT: v_readfirstlane_b32 s99, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 @@ -203637,39 +203063,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s24, v49 +; SI-NEXT: v_readfirstlane_b32 s49, v49 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s85, v50 +; SI-NEXT: v_readfirstlane_b32 s45, v50 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s66, v51 +; SI-NEXT: v_readfirstlane_b32 s46, v51 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 vcc_lo, v13 -; SI-NEXT: v_readfirstlane_b32 vcc_hi, v14 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 61 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 62 -; SI-NEXT: v_writelane_b32 v43, s38, 63 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v28 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s20, v31 +; SI-NEXT: v_readfirstlane_b32 s60, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s19, v32 +; SI-NEXT: v_readfirstlane_b32 s74, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s27, v33 +; SI-NEXT: v_readfirstlane_b32 s17, v33 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s94, v34 +; SI-NEXT: v_readfirstlane_b32 s87, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s72, v35 +; SI-NEXT: v_readfirstlane_b32 s18, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s73, v36 +; SI-NEXT: v_readfirstlane_b32 s7, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s67, v37 +; SI-NEXT: v_readfirstlane_b32 s36, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s71, v38 +; SI-NEXT: v_readfirstlane_b32 s29, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s97, v39 +; SI-NEXT: v_readfirstlane_b32 s24, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 @@ -203679,509 +203101,513 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s35, v48 +; SI-NEXT: v_readfirstlane_b32 s58, v48 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v49 +; SI-NEXT: v_readfirstlane_b32 s67, v49 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s87, v50 +; SI-NEXT: v_readfirstlane_b32 s41, v50 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s63, v51 +; SI-NEXT: v_readfirstlane_b32 s21, v51 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: v_readfirstlane_b32 s27, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s81, v32 +; SI-NEXT: v_readfirstlane_b32 s23, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s80, v33 +; SI-NEXT: v_readfirstlane_b32 s34, v33 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s86, v34 +; SI-NEXT: v_readfirstlane_b32 s64, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s34, v35 +; SI-NEXT: v_readfirstlane_b32 s56, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s84, v36 +; SI-NEXT: v_readfirstlane_b32 s53, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s31, v37 +; SI-NEXT: v_readfirstlane_b32 s52, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s61, v38 +; SI-NEXT: v_readfirstlane_b32 s54, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s62, v39 +; SI-NEXT: v_readfirstlane_b32 s66, v39 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s53, v48 +; SI-NEXT: v_readfirstlane_b32 s48, v48 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s52, v49 -; SI-NEXT: v_writelane_b32 v42, s52, 7 -; SI-NEXT: v_writelane_b32 v42, s53, 8 -; SI-NEXT: v_writelane_b32 v42, s57, 9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s54, v50 -; SI-NEXT: v_writelane_b32 v42, s58, 10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s55, v51 -; SI-NEXT: v_writelane_b32 v42, s54, 11 -; SI-NEXT: v_writelane_b32 v42, s55, 12 -; SI-NEXT: v_writelane_b32 v42, s64, 13 -; SI-NEXT: v_writelane_b32 v42, s65, 14 -; SI-NEXT: v_writelane_b32 v42, s67, 15 -; SI-NEXT: v_writelane_b32 v42, s71, 16 -; SI-NEXT: v_writelane_b32 v42, s80, 17 -; SI-NEXT: v_writelane_b32 v42, s81, 18 -; SI-NEXT: v_writelane_b32 v42, s59, 19 -; SI-NEXT: v_writelane_b32 v42, s60, 20 -; SI-NEXT: v_writelane_b32 v42, s86, 21 -; SI-NEXT: v_writelane_b32 v42, s97, 22 -; SI-NEXT: v_writelane_b32 v42, s34, 23 -; SI-NEXT: v_writelane_b32 v42, s66, 24 -; SI-NEXT: v_writelane_b32 v42, s85, 25 -; SI-NEXT: v_writelane_b32 v42, s31, 26 -; SI-NEXT: v_writelane_b32 v42, s84, 27 -; SI-NEXT: v_writelane_b32 v42, s35, 28 -; SI-NEXT: v_writelane_b32 v42, s98, 29 -; SI-NEXT: v_writelane_b32 v42, s17, 30 -; SI-NEXT: v_writelane_b32 v42, s20, 31 -; SI-NEXT: v_writelane_b32 v42, s61, 32 -; SI-NEXT: v_writelane_b32 v42, s19, 33 -; SI-NEXT: v_writelane_b32 v42, s62, 34 -; SI-NEXT: v_writelane_b32 v42, s23, 35 -; SI-NEXT: v_writelane_b32 v42, s83, 36 -; SI-NEXT: v_writelane_b32 v42, s87, 37 -; SI-NEXT: v_writelane_b32 v42, s26, 38 -; SI-NEXT: v_writelane_b32 v42, s94, 39 -; SI-NEXT: v_writelane_b32 v42, s27, 40 -; SI-NEXT: v_writelane_b32 v42, s63, 41 -; SI-NEXT: v_writelane_b32 v42, s79, 42 -; SI-NEXT: v_writelane_b32 v42, s88, 43 -; SI-NEXT: v_writelane_b32 v42, s72, 44 -; SI-NEXT: v_writelane_b32 v42, s73, 45 -; SI-NEXT: v_writelane_b32 v42, s74, 46 -; SI-NEXT: v_writelane_b32 v42, s75, 47 -; SI-NEXT: v_writelane_b32 v42, s24, 48 -; SI-NEXT: v_writelane_b32 v42, s25, 49 -; SI-NEXT: v_writelane_b32 v42, s28, 50 +; SI-NEXT: v_readfirstlane_b32 s19, v49 +; SI-NEXT: v_writelane_b32 v43, s19, 9 +; SI-NEXT: v_writelane_b32 v43, s48, 10 +; SI-NEXT: v_writelane_b32 v43, s40, 11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s38, v50 +; SI-NEXT: v_writelane_b32 v43, s42, 12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s39, v51 +; SI-NEXT: v_writelane_b32 v43, s38, 13 +; SI-NEXT: v_writelane_b32 v43, s39, 14 +; SI-NEXT: v_writelane_b32 v43, s76, 15 +; SI-NEXT: v_writelane_b32 v43, s78, 16 +; SI-NEXT: v_writelane_b32 v43, s36, 17 +; SI-NEXT: v_writelane_b32 v43, s29, 18 +; SI-NEXT: v_writelane_b32 v43, s34, 19 +; SI-NEXT: v_writelane_b32 v43, s23, 20 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 21 +; SI-NEXT: v_writelane_b32 v43, s50, 22 +; SI-NEXT: v_writelane_b32 v43, s64, 23 +; SI-NEXT: v_writelane_b32 v43, s24, 24 +; SI-NEXT: v_writelane_b32 v43, s56, 25 +; SI-NEXT: v_writelane_b32 v43, s46, 26 +; SI-NEXT: v_writelane_b32 v43, s45, 27 +; SI-NEXT: v_writelane_b32 v43, s52, 28 +; SI-NEXT: v_writelane_b32 v43, s53, 29 +; SI-NEXT: v_writelane_b32 v43, s58, 30 +; SI-NEXT: v_writelane_b32 v43, s60, 31 +; SI-NEXT: v_writelane_b32 v43, s54, 32 +; SI-NEXT: v_writelane_b32 v43, s74, 33 +; SI-NEXT: v_writelane_b32 v43, s66, 34 +; SI-NEXT: v_writelane_b32 v43, s67, 35 +; SI-NEXT: v_writelane_b32 v43, s41, 36 +; SI-NEXT: v_writelane_b32 v43, s13, 37 +; SI-NEXT: v_writelane_b32 v43, s92, 38 +; SI-NEXT: v_writelane_b32 v43, s21, 39 +; SI-NEXT: v_writelane_b32 v43, s26, 40 +; SI-NEXT: v_writelane_b32 v43, s94, 41 +; SI-NEXT: v_writelane_b32 v43, s27, 42 +; SI-NEXT: v_writelane_b32 v43, s87, 43 +; SI-NEXT: v_writelane_b32 v43, s17, 44 +; SI-NEXT: v_writelane_b32 v43, s18, 45 +; SI-NEXT: v_writelane_b32 v43, s7, 46 +; SI-NEXT: v_writelane_b32 v43, s95, 47 +; SI-NEXT: v_writelane_b32 v43, s90, 48 +; SI-NEXT: v_writelane_b32 v43, s99, 49 +; SI-NEXT: v_writelane_b32 v43, s49, 50 +; SI-NEXT: v_writelane_b32 v43, s71, 51 +; SI-NEXT: v_writelane_b32 v43, s69, 52 ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_readlane_b32 s4, v43, 13 -; SI-NEXT: v_readlane_b32 s5, v43, 12 +; SI-NEXT: v_readlane_b32 s4, v44, 13 +; SI-NEXT: v_readlane_b32 s5, v44, 12 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_writelane_b32 v43, s4, 57 +; SI-NEXT: v_readlane_b32 s4, v44, 11 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: v_readlane_b32 s6, v44, 10 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 vcc_lo, s6, s4 +; SI-NEXT: v_readlane_b32 s4, v44, 5 +; SI-NEXT: v_readlane_b32 s6, v44, 4 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: v_writelane_b32 v43, s70, 59 +; SI-NEXT: v_writelane_b32 v43, s80, 60 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: v_writelane_b32 v43, s4, 55 +; SI-NEXT: v_readlane_b32 s4, v44, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s6, v44, 2 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 s68, s6, s4 +; SI-NEXT: s_and_b32 s4, s86, 0xff +; SI-NEXT: s_lshl_b32 s6, s84, 8 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: v_writelane_b32 v43, s4, 53 +; SI-NEXT: s_and_b32 s4, s65, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s82, 24 +; SI-NEXT: s_or_b32 s70, s6, s4 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s6, s9, 8 +; SI-NEXT: s_or_b32 s11, s4, s6 +; SI-NEXT: s_and_b32 s4, s96, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s98, 24 +; SI-NEXT: s_or_b32 s80, s6, s4 +; SI-NEXT: s_and_b32 s4, s42, 0xff +; SI-NEXT: s_lshl_b32 s6, s40, 8 +; SI-NEXT: s_or_b32 s9, s4, s6 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: v_writelane_b32 v43, s15, 61 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s14, 24 +; SI-NEXT: v_writelane_b32 v43, s82, 62 +; SI-NEXT: s_or_b32 s82, s6, s4 +; SI-NEXT: s_and_b32 s4, s50, 0xff +; SI-NEXT: s_lshl_b32 s6, vcc_hi, 8 +; SI-NEXT: s_or_b32 s15, s4, s6 +; SI-NEXT: s_and_b32 s4, s76, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s78, 24 +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_writelane_b32 v43, s65, 63 +; SI-NEXT: v_writelane_b32 v42, s84, 0 +; SI-NEXT: s_or_b32 s84, s6, s4 +; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_lshl_b32 s6, s54, 8 +; SI-NEXT: s_or_b32 s78, s4, s6 +; SI-NEXT: s_and_b32 s4, s52, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s53, 24 +; SI-NEXT: v_writelane_b32 v42, s86, 1 +; SI-NEXT: s_or_b32 s86, s6, s4 +; SI-NEXT: s_and_b32 s4, s27, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s42, s4, s6 +; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s67, 24 +; SI-NEXT: s_mov_b32 s65, s96 +; SI-NEXT: s_or_b32 s96, s6, s4 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s6, s18, 8 +; SI-NEXT: s_or_b32 s52, s4, s6 +; SI-NEXT: s_and_b32 s4, s87, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s17, 24 +; SI-NEXT: s_mov_b32 s7, s98 +; SI-NEXT: s_or_b32 s98, s6, s4 +; SI-NEXT: s_and_b32 s4, s49, 0xff +; SI-NEXT: s_lshl_b32 s6, s99, 8 +; SI-NEXT: s_or_b32 s76, s4, s6 +; SI-NEXT: s_and_b32 s4, s95, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s90, 24 +; SI-NEXT: s_or_b32 s40, s6, s4 +; SI-NEXT: s_and_b32 s4, s69, 0xff +; SI-NEXT: s_lshl_b32 s6, s71, 8 +; SI-NEXT: s_or_b32 s50, s4, s6 +; SI-NEXT: v_readlane_b32 s4, v44, 35 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s29, s4, s5 -; SI-NEXT: v_readlane_b32 s4, v43, 5 -; SI-NEXT: v_readlane_b32 s5, v43, 4 +; SI-NEXT: v_readlane_b32 s6, v44, 34 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 s14, s6, s4 +; SI-NEXT: v_readlane_b32 s4, v44, 31 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s45, s4, s5 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 -; SI-NEXT: s_or_b32 s46, s4, s5 -; SI-NEXT: s_and_b32 s4, s56, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 -; SI-NEXT: s_or_b32 s47, s4, s5 -; SI-NEXT: s_and_b32 s4, s58, 0xff -; SI-NEXT: s_lshl_b32 s5, s57, 8 -; SI-NEXT: s_or_b32 s56, s4, s5 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s59, 8 -; SI-NEXT: s_or_b32 s57, s4, s5 -; SI-NEXT: s_and_b32 s4, s62, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 -; SI-NEXT: s_or_b32 s58, s4, s5 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: s_lshl_b32 s5, s63, 8 -; SI-NEXT: s_or_b32 s59, s4, s5 -; SI-NEXT: s_and_b32 s4, s73, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 -; SI-NEXT: s_or_b32 s60, s4, s5 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 -; SI-NEXT: s_or_b32 s61, s4, s5 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_or_b32 s62, s4, s5 -; SI-NEXT: s_and_b32 s4, s36, 0xff -; SI-NEXT: s_lshl_b32 s5, s16, 8 -; SI-NEXT: s_or_b32 s63, s4, s5 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s9, 8 -; SI-NEXT: s_or_b32 s72, s4, s5 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: s_or_b32 s73, s4, s5 +; SI-NEXT: s_lshl_b32 s6, s97, 8 +; SI-NEXT: s_or_b32 vcc_hi, s4, s6 +; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: v_readlane_b32 s6, v44, 30 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 s12, s6, s4 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s6, s10, 8 +; SI-NEXT: s_or_b32 s54, s4, s6 ; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s91, 8 -; SI-NEXT: s_or_b32 s74, s4, s5 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 -; SI-NEXT: s_or_b32 s75, s4, s5 -; SI-NEXT: v_readlane_b32 s4, v43, 9 -; SI-NEXT: v_readlane_b32 s5, v43, 8 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s5, s4, s5 -; SI-NEXT: v_readlane_b32 s4, v43, 7 +; SI-NEXT: v_readlane_b32 s6, v44, 28 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 s10, s6, s4 +; SI-NEXT: v_readlane_b32 s6, v44, 25 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_or_b32 s66, s4, s6 +; SI-NEXT: v_readlane_b32 s4, v44, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s6, v43, 6 +; SI-NEXT: v_readlane_b32 s6, v44, 23 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 -; SI-NEXT: s_or_b32 s7, s6, s4 -; SI-NEXT: v_readlane_b32 s4, v43, 11 +; SI-NEXT: s_or_b32 s8, s6, s4 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s6, s22, 8 +; SI-NEXT: s_or_b32 s18, s4, s6 +; SI-NEXT: v_readlane_b32 s4, v44, 22 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s6, v43, 10 +; SI-NEXT: v_readlane_b32 s6, v44, 21 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_readlane_b32 s17, v44, 18 +; SI-NEXT: s_or_b32 s6, s6, s4 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s22, s17, 8 +; SI-NEXT: s_or_b32 s41, s4, s22 +; SI-NEXT: v_readlane_b32 s4, v44, 17 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s17, v44, 16 +; SI-NEXT: v_readlane_b32 s21, v44, 9 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s22, s17, 24 +; SI-NEXT: s_and_b32 s27, s21, 0xff +; SI-NEXT: v_readlane_b32 s21, v44, 8 +; SI-NEXT: s_or_b32 s4, s22, s4 +; SI-NEXT: s_lshl_b32 s22, s21, 8 +; SI-NEXT: v_readlane_b32 s16, v44, 7 +; SI-NEXT: s_or_b32 s69, s27, s22 +; SI-NEXT: s_and_b32 s22, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v44, 6 +; SI-NEXT: s_lshl_b32 s27, s22, 16 +; SI-NEXT: s_lshl_b32 s22, s16, 24 +; SI-NEXT: v_readlane_b32 s16, v44, 1 +; SI-NEXT: s_or_b32 s53, s22, s27 +; SI-NEXT: s_and_b32 s22, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v44, 0 +; SI-NEXT: s_lshl_b32 s27, s16, 8 +; SI-NEXT: s_or_b32 s71, s22, s27 +; SI-NEXT: s_and_b32 s22, s28, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s27, s43, 24 +; SI-NEXT: s_or_b32 s27, s27, s22 +; SI-NEXT: s_and_b32 s22, s88, 0xff +; SI-NEXT: s_lshl_b32 s28, s79, 8 +; SI-NEXT: v_writelane_b32 v42, s85, 2 +; SI-NEXT: s_or_b32 s85, s22, s28 +; SI-NEXT: s_and_b32 s22, s75, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s77, 24 +; SI-NEXT: s_or_b32 s67, s28, s22 +; SI-NEXT: s_and_b32 s22, s31, 0xff +; SI-NEXT: s_lshl_b32 s28, s93, 8 +; SI-NEXT: v_writelane_b32 v42, s97, 3 +; SI-NEXT: s_or_b32 s97, s22, s28 +; SI-NEXT: s_and_b32 s22, s89, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s91, 24 +; SI-NEXT: v_readlane_b32 s16, v43, 60 +; SI-NEXT: s_or_b32 s43, s28, s22 +; SI-NEXT: s_and_b32 s22, s16, 0xff +; SI-NEXT: s_lshl_b32 s28, s51, 8 +; SI-NEXT: s_or_b32 s77, s22, s28 +; SI-NEXT: s_and_b32 s22, s35, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s37, 24 +; SI-NEXT: s_or_b32 s47, s28, s22 +; SI-NEXT: s_and_b32 s22, s39, 0xff +; SI-NEXT: s_lshl_b32 s28, s38, 8 +; SI-NEXT: s_or_b32 s99, s22, s28 +; SI-NEXT: s_and_b32 s22, s19, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s48, 24 +; SI-NEXT: s_or_b32 s59, s28, s22 +; SI-NEXT: s_and_b32 s22, s56, 0xff +; SI-NEXT: s_lshl_b32 s28, s64, 8 +; SI-NEXT: s_or_b32 s87, s22, s28 +; SI-NEXT: s_and_b32 s22, s34, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s23, 24 +; SI-NEXT: s_or_b32 s63, s28, s22 +; SI-NEXT: s_and_b32 s22, s58, 0xff +; SI-NEXT: s_lshl_b32 s28, s24, 8 +; SI-NEXT: s_or_b32 s88, s22, s28 +; SI-NEXT: s_and_b32 s22, s29, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s36, 24 +; SI-NEXT: s_or_b32 s75, s28, s22 +; SI-NEXT: s_and_b32 s22, s74, 0xff +; SI-NEXT: s_lshl_b32 s28, s60, 8 +; SI-NEXT: s_or_b32 s89, s22, s28 +; SI-NEXT: s_and_b32 s22, s46, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s45, 24 +; SI-NEXT: s_or_b32 s45, s28, s22 +; SI-NEXT: s_and_b32 s22, s94, 0xff +; SI-NEXT: s_lshl_b32 s28, s26, 8 +; SI-NEXT: s_or_b32 s91, s22, s28 +; SI-NEXT: s_and_b32 s22, s13, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s92, 24 +; SI-NEXT: s_or_b32 s79, s28, s22 +; SI-NEXT: s_and_b32 s22, s20, 0xff +; SI-NEXT: s_lshl_b32 s28, s81, 8 +; SI-NEXT: v_readlane_b32 s16, v44, 33 +; SI-NEXT: s_or_b32 s17, s22, s28 +; SI-NEXT: s_and_b32 s22, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v44, 32 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s16, 24 +; SI-NEXT: v_readlane_b32 s13, v44, 29 +; SI-NEXT: s_or_b32 s49, s28, s22 +; SI-NEXT: s_and_b32 s22, s13, 0xff +; SI-NEXT: s_lshl_b32 s28, s72, 8 +; SI-NEXT: s_or_b32 s13, s22, s28 +; SI-NEXT: s_and_b32 s22, s83, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s30, 24 +; SI-NEXT: s_or_b32 s51, s28, s22 +; SI-NEXT: v_readlane_b32 s22, v43, 61 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_lshl_b32 s28, s62, 8 +; SI-NEXT: s_or_b32 s16, s22, s28 +; SI-NEXT: v_readlane_b32 s22, v44, 27 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: v_readlane_b32 s23, v44, 26 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s23, 24 +; SI-NEXT: s_or_b32 s64, s28, s22 +; SI-NEXT: s_and_b32 s22, s25, 0xff +; SI-NEXT: s_lshl_b32 s28, s55, 8 +; SI-NEXT: s_or_b32 s19, s22, s28 +; SI-NEXT: s_and_b32 s22, s73, 0xff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s61, 24 +; SI-NEXT: s_or_b32 s25, s28, s22 +; SI-NEXT: s_and_b32 s22, s57, 0xff +; SI-NEXT: s_lshl_b32 s28, s44, 8 +; SI-NEXT: s_or_b32 s26, s22, s28 +; SI-NEXT: v_readlane_b32 s22, v44, 20 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: v_readlane_b32 s23, v44, 19 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s23, 24 +; SI-NEXT: s_or_b32 s57, s28, s22 +; SI-NEXT: s_lshl_b32 s28, s5, 8 +; SI-NEXT: v_readlane_b32 s5, v43, 57 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_or_b32 s4, s6, s4 -; SI-NEXT: s_or_b32 s5, s5, s7 -; SI-NEXT: v_readlane_b32 s6, v43, 1 -; SI-NEXT: v_readlane_b32 s7, v43, 0 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_or_b32 s7, s6, s7 -; SI-NEXT: s_and_b32 s6, s11, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s12, 24 -; SI-NEXT: s_or_b32 s37, s8, s6 -; SI-NEXT: v_readlane_b32 s6, v43, 3 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s8, v43, 2 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s8, 24 -; SI-NEXT: s_or_b32 s6, s8, s6 -; SI-NEXT: s_and_b32 s8, s18, 0xff -; SI-NEXT: s_lshl_b32 s9, s15, 8 -; SI-NEXT: s_or_b32 s9, s8, s9 -; SI-NEXT: s_and_b32 s8, s13, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s10, s14, 24 -; SI-NEXT: s_or_b32 s68, s10, s8 -; SI-NEXT: s_and_b32 s8, s21, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s10, s22, 24 -; SI-NEXT: s_or_b32 s8, s10, s8 -; SI-NEXT: s_and_b32 s10, s77, 0xff -; SI-NEXT: s_lshl_b32 s11, s76, 8 -; SI-NEXT: s_or_b32 s11, s10, s11 -; SI-NEXT: s_and_b32 s10, s40, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s12, s41, 24 -; SI-NEXT: s_or_b32 s99, s12, s10 -; SI-NEXT: s_and_b32 s10, vcc_lo, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s12, vcc_hi, 24 -; SI-NEXT: s_or_b32 s10, s12, s10 -; SI-NEXT: s_and_b32 s12, s49, 0xff -; SI-NEXT: s_lshl_b32 s13, s48, 8 -; SI-NEXT: s_or_b32 s13, s12, s13 -; SI-NEXT: s_and_b32 s12, s38, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s14, s39, 24 -; SI-NEXT: s_or_b32 s92, s14, s12 -; SI-NEXT: s_and_b32 s12, s50, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s14, s51, 24 -; SI-NEXT: s_or_b32 s12, s14, s12 -; SI-NEXT: s_and_b32 s14, s55, 0xff -; SI-NEXT: s_lshl_b32 s15, s54, 8 -; SI-NEXT: s_or_b32 s15, s14, s15 -; SI-NEXT: s_and_b32 s14, s52, 0xff -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s24, s53, 24 -; SI-NEXT: s_mov_b32 s28, s90 -; SI-NEXT: s_or_b32 s90, s24, s14 -; SI-NEXT: s_and_b32 s14, s64, 0xff -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s25, s65, 24 -; SI-NEXT: s_or_b32 s14, s25, s14 -; SI-NEXT: s_and_b32 s25, s34, 0xff -; SI-NEXT: s_lshl_b32 s40, s86, 8 -; SI-NEXT: s_or_b32 s41, s25, s40 -; SI-NEXT: s_and_b32 s25, s80, 0xff -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_lshl_b32 s40, s81, 24 -; SI-NEXT: s_or_b32 s18, s40, s25 -; SI-NEXT: s_and_b32 s40, s31, 0xff -; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_lshl_b32 s42, s84, 24 -; SI-NEXT: s_or_b32 s40, s42, s40 -; SI-NEXT: s_and_b32 s42, s35, 0xff -; SI-NEXT: s_lshl_b32 s43, s97, 8 -; SI-NEXT: s_or_b32 s43, s42, s43 -; SI-NEXT: s_and_b32 s42, s71, 0xff -; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_lshl_b32 s76, s67, 24 -; SI-NEXT: s_or_b32 s35, s76, s42 -; SI-NEXT: s_and_b32 s42, s87, 0xff -; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_lshl_b32 s76, s83, 24 -; SI-NEXT: s_or_b32 s42, s76, s42 -; SI-NEXT: s_and_b32 s76, s19, 0xff -; SI-NEXT: s_lshl_b32 s77, s20, 8 -; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s66, 0xff -; SI-NEXT: v_writelane_b32 v42, s78, 52 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s85, 24 -; SI-NEXT: s_or_b32 s19, s78, s77 -; SI-NEXT: s_and_b32 s77, s94, 0xff -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s27, 24 -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: s_or_b32 vcc_lo, s78, s77 -; SI-NEXT: s_or_b32 vcc_hi, s76, s19 -; SI-NEXT: s_and_b32 s76, s26, 0xff -; SI-NEXT: s_lshl_b32 s77, s23, 8 -; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s98, 0xff -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 -; SI-NEXT: s_or_b32 s71, s78, s77 -; SI-NEXT: s_and_b32 s77, s79, 0xff -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 40 -; SI-NEXT: s_and_b32 s41, s41, 0xffff -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s88, 24 -; SI-NEXT: s_or_b32 s39, s76, s71 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 39 -; SI-NEXT: s_or_b32 s41, s41, s18 -; SI-NEXT: s_mov_b32 s31, s18 -; SI-NEXT: s_or_b32 s38, s78, s77 -; SI-NEXT: s_lshl_b32 s77, s17, 8 -; SI-NEXT: v_readlane_b32 s18, v43, 38 -; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 37 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 -; SI-NEXT: s_or_b32 s80, s78, s77 -; SI-NEXT: s_and_b32 s77, s95, 0xff -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 36 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s93, 24 -; SI-NEXT: s_or_b32 s49, s76, s80 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 35 -; SI-NEXT: s_or_b32 s48, s78, s77 -; SI-NEXT: s_lshl_b32 s77, s17, 8 -; SI-NEXT: v_readlane_b32 s17, v43, 34 -; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 33 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 -; SI-NEXT: s_or_b32 s81, s78, s77 -; SI-NEXT: s_and_b32 s77, s30, 0xff -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s69, 24 -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 31 -; SI-NEXT: s_or_b32 s50, s78, s77 -; SI-NEXT: s_or_b32 s51, s76, s81 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: s_lshl_b32 s77, s96, 8 -; SI-NEXT: v_readlane_b32 s17, v43, 30 -; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s82, 24 -; SI-NEXT: v_writelane_b32 v42, s96, 53 -; SI-NEXT: v_readlane_b32 s18, v43, 32 -; SI-NEXT: v_writelane_b32 v42, s82, 54 -; SI-NEXT: s_or_b32 s82, s78, s77 -; SI-NEXT: s_and_b32 s77, s18, 0xff -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 28 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s70, 24 -; SI-NEXT: s_or_b32 s53, s76, s82 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 27 -; SI-NEXT: s_or_b32 s52, s78, s77 -; SI-NEXT: s_lshl_b32 s77, s17, 8 -; SI-NEXT: v_readlane_b32 s18, v43, 26 -; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s18, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 25 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 -; SI-NEXT: v_writelane_b32 v42, s16, 55 -; SI-NEXT: s_or_b32 s16, s78, s77 -; SI-NEXT: s_and_b32 s77, s89, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 29 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 22 -; SI-NEXT: v_readlane_b32 s18, v43, 21 -; SI-NEXT: s_or_b32 s54, s78, s77 -; SI-NEXT: s_or_b32 s55, s76, s16 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: s_lshl_b32 s77, s18, 8 -; SI-NEXT: v_readlane_b32 s17, v43, 20 -; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 19 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 -; SI-NEXT: v_readlane_b32 s17, v43, 24 -; SI-NEXT: s_or_b32 s83, s78, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 23 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 17 -; SI-NEXT: v_readlane_b32 s18, v43, 16 -; SI-NEXT: s_or_b32 s64, s78, s77 -; SI-NEXT: s_or_b32 s65, s76, s83 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: s_lshl_b32 s77, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v43, 15 -; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 14 -; SI-NEXT: v_writelane_b32 v42, s89, 56 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 -; SI-NEXT: v_writelane_b32 v42, s70, 57 -; SI-NEXT: s_or_b32 s85, s78, s77 -; SI-NEXT: s_and_b32 s77, s44, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 18 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: s_and_b32 s13, s13, 0xffff -; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_and_b32 s43, s43, 0xffff -; SI-NEXT: v_writelane_b32 v42, s69, 58 -; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: s_and_b32 s44, s29, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s37 -; SI-NEXT: s_or_b32 s9, s9, s68 -; SI-NEXT: s_or_b32 s11, s11, s99 -; SI-NEXT: s_or_b32 s13, s13, s92 -; SI-NEXT: s_or_b32 s15, s15, s90 -; SI-NEXT: s_or_b32 s43, s43, s35 -; SI-NEXT: v_writelane_b32 v42, s30, 59 -; SI-NEXT: s_mov_b32 s23, s91 -; SI-NEXT: s_mov_b32 s91, s36 -; SI-NEXT: s_or_b32 s66, s78, s77 -; SI-NEXT: s_or_b32 s67, s76, s85 -; SI-NEXT: s_and_b32 s45, s45, 0xffff -; SI-NEXT: s_and_b32 s46, s46, 0xffff -; SI-NEXT: s_and_b32 s47, s47, 0xffff -; SI-NEXT: s_and_b32 s56, s56, 0xffff -; SI-NEXT: s_and_b32 s57, s57, 0xffff -; SI-NEXT: s_and_b32 s30, s58, 0xffff -; SI-NEXT: s_and_b32 s34, s59, 0xffff -; SI-NEXT: s_and_b32 s36, s60, 0xffff -; SI-NEXT: s_and_b32 s97, s61, 0xffff -; SI-NEXT: s_and_b32 s86, s62, 0xffff -; SI-NEXT: s_and_b32 s98, s63, 0xffff -; SI-NEXT: s_and_b32 s17, s72, 0xffff -; SI-NEXT: s_and_b32 s87, s73, 0xffff -; SI-NEXT: s_and_b32 s96, s74, 0xffff -; SI-NEXT: s_and_b32 s22, s75, 0xffff -; SI-NEXT: s_or_b32 s74, s44, s4 -; SI-NEXT: s_mov_b32 s75, s5 -; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 -; SI-NEXT: s_mov_b32 s70, s93 -; SI-NEXT: s_mov_b32 s69, s95 -; SI-NEXT: s_mov_b32 s93, s28 -; SI-NEXT: s_or_b32 s72, s45, s6 -; SI-NEXT: s_mov_b32 s73, s7 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 -; SI-NEXT: s_or_b32 s62, s46, s8 -; SI-NEXT: s_mov_b32 s63, s9 -; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 -; SI-NEXT: s_or_b32 s60, s47, s10 -; SI-NEXT: s_mov_b32 s61, s11 -; SI-NEXT: s_lshr_b64 s[88:89], s[10:11], 16 -; SI-NEXT: s_or_b32 s58, s56, s12 -; SI-NEXT: s_mov_b32 s59, s13 -; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 -; SI-NEXT: s_or_b32 s56, s57, s14 -; SI-NEXT: s_mov_b32 s57, s15 -; SI-NEXT: s_lshr_b64 s[24:25], s[14:15], 16 -; SI-NEXT: s_or_b32 s46, s30, s40 -; SI-NEXT: s_mov_b32 s47, s41 -; SI-NEXT: s_or_b32 s44, s34, s42 -; SI-NEXT: s_mov_b32 s34, s4 -; SI-NEXT: s_mov_b32 s45, s43 -; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 -; SI-NEXT: s_or_b32 s42, s36, vcc_lo -; SI-NEXT: s_mov_b32 s43, vcc_hi -; SI-NEXT: s_lshr_b64 vcc, vcc, 16 -; SI-NEXT: s_or_b32 s40, s97, s38 -; SI-NEXT: s_mov_b32 s41, s39 -; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 16 -; SI-NEXT: s_or_b32 s14, s86, s48 -; SI-NEXT: s_mov_b32 s15, s49 -; SI-NEXT: s_lshr_b64 s[48:49], s[48:49], 16 -; SI-NEXT: s_or_b32 s12, s98, s50 -; SI-NEXT: s_mov_b32 s13, s51 -; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16 -; SI-NEXT: s_or_b32 s10, s17, s52 -; SI-NEXT: s_mov_b32 s11, s53 -; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 -; SI-NEXT: s_or_b32 s8, s87, s54 -; SI-NEXT: s_mov_b32 s9, s55 -; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 -; SI-NEXT: s_or_b32 s6, s96, s64 -; SI-NEXT: s_mov_b32 s7, s65 -; SI-NEXT: s_lshr_b64 s[64:65], s[64:65], 16 -; SI-NEXT: s_or_b32 s4, s22, s66 -; SI-NEXT: s_mov_b32 s5, s67 -; SI-NEXT: s_lshr_b64 s[66:67], s[66:67], 16 -; SI-NEXT: v_readlane_b32 s17, v42, 51 -; SI-NEXT: s_lshr_b32 s55, s17, 16 -; SI-NEXT: s_lshr_b32 s53, s37, 16 -; SI-NEXT: s_lshr_b32 s51, s68, 16 -; SI-NEXT: s_lshr_b32 s49, s99, 16 -; SI-NEXT: s_lshr_b32 s86, s92, 16 -; SI-NEXT: s_lshr_b32 s39, s90, 16 -; SI-NEXT: s_lshr_b32 s18, s31, 16 -; SI-NEXT: s_lshr_b32 s22, s35, 16 -; SI-NEXT: s_lshr_b32 s97, s19, 16 -; SI-NEXT: s_lshr_b32 s65, s71, 16 -; SI-NEXT: s_lshr_b32 s19, s80, 16 -; SI-NEXT: s_lshr_b32 s71, s81, 16 -; SI-NEXT: s_lshr_b32 s67, s82, 16 -; SI-NEXT: v_readlane_b32 s82, v42, 54 -; SI-NEXT: v_readlane_b32 s96, v42, 53 -; SI-NEXT: s_lshr_b32 s80, s16, 16 -; SI-NEXT: v_readlane_b32 s16, v42, 55 -; SI-NEXT: s_lshr_b32 s81, s83, 16 -; SI-NEXT: s_mov_b32 s90, s93 -; SI-NEXT: v_readlane_b32 s78, v42, 52 -; SI-NEXT: s_mov_b32 s95, s69 -; SI-NEXT: s_mov_b32 s93, s70 -; SI-NEXT: v_readlane_b32 s30, v42, 59 -; SI-NEXT: v_readlane_b32 s69, v42, 58 -; SI-NEXT: v_readlane_b32 s70, v42, 57 -; SI-NEXT: v_readlane_b32 s89, v42, 56 -; SI-NEXT: s_lshr_b32 s77, s85, 16 -; SI-NEXT: s_mov_b32 s84, vcc_lo -; SI-NEXT: s_mov_b32 s36, s91 -; SI-NEXT: s_mov_b32 s91, s23 +; SI-NEXT: s_or_b32 s74, s5, vcc_lo +; SI-NEXT: v_readlane_b32 s5, v43, 55 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s72, s5, s68 +; SI-NEXT: v_readlane_b32 s5, v43, 53 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s62, s5, s70 +; SI-NEXT: s_and_b32 s5, s11, 0xffff +; SI-NEXT: s_or_b32 s60, s5, s80 +; SI-NEXT: s_and_b32 s5, s9, 0xffff +; SI-NEXT: s_or_b32 s58, s5, s82 +; SI-NEXT: s_and_b32 s5, s15, 0xffff +; SI-NEXT: s_or_b32 s56, s5, s84 +; SI-NEXT: s_and_b32 s5, s78, 0xffff +; SI-NEXT: s_or_b32 s46, s5, s86 +; SI-NEXT: s_and_b32 s5, s42, 0xffff +; SI-NEXT: s_or_b32 s44, s5, s96 +; SI-NEXT: s_and_b32 s5, s52, 0xffff +; SI-NEXT: s_or_b32 s42, s5, s98 +; SI-NEXT: s_and_b32 s5, s76, 0xffff +; SI-NEXT: s_or_b32 s76, s5, s40 +; SI-NEXT: s_and_b32 s5, s50, 0xffff +; SI-NEXT: s_or_b32 s50, s5, s14 +; SI-NEXT: s_and_b32 s5, vcc_hi, 0xffff +; SI-NEXT: s_or_b32 s52, s5, s12 +; SI-NEXT: s_and_b32 s5, s54, 0xffff +; SI-NEXT: s_or_b32 s54, s5, s10 +; SI-NEXT: s_and_b32 s5, s66, 0xffff +; SI-NEXT: s_or_b32 s78, s5, s8 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_or_b32 s66, s5, s6 +; SI-NEXT: s_and_b32 s5, s41, 0xffff +; SI-NEXT: s_or_b32 s24, s5, s4 +; SI-NEXT: s_and_b32 s5, s69, 0xffff +; SI-NEXT: s_or_b32 vcc_hi, s5, s53 +; SI-NEXT: s_and_b32 s5, s71, 0xffff +; SI-NEXT: s_or_b32 s69, s5, s27 +; SI-NEXT: s_and_b32 s5, s85, 0xffff +; SI-NEXT: s_or_b32 s71, s5, s67 +; SI-NEXT: s_and_b32 s5, s97, 0xffff +; SI-NEXT: s_or_b32 s81, s5, s43 +; SI-NEXT: s_and_b32 s5, s77, 0xffff +; SI-NEXT: s_or_b32 s83, s5, s47 +; SI-NEXT: s_and_b32 s5, s99, 0xffff +; SI-NEXT: s_or_b32 s85, s5, s59 +; SI-NEXT: s_and_b32 s5, s87, 0xffff +; SI-NEXT: s_or_b32 s87, s5, s63 +; SI-NEXT: s_and_b32 s5, s88, 0xffff +; SI-NEXT: s_or_b32 s97, s5, s75 +; SI-NEXT: s_and_b32 s5, s89, 0xffff +; SI-NEXT: s_or_b32 s99, s5, s45 +; SI-NEXT: s_and_b32 s5, s91, 0xffff +; SI-NEXT: v_readlane_b32 s22, v43, 59 +; SI-NEXT: s_or_b32 s41, s5, s79 +; SI-NEXT: s_and_b32 s5, s17, 0xffff +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_lshr_b64 s[20:21], vcc, 16 +; SI-NEXT: s_or_b32 s15, s5, s49 +; SI-NEXT: s_and_b32 s5, s13, 0xffff +; SI-NEXT: s_or_b32 s61, s22, s28 +; SI-NEXT: v_readlane_b32 s22, v44, 15 +; SI-NEXT: v_writelane_b32 v43, s20, 53 +; SI-NEXT: s_or_b32 s13, s5, s51 +; SI-NEXT: s_and_b32 s5, s16, 0xffff +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: v_readlane_b32 s23, v44, 14 +; SI-NEXT: v_writelane_b32 v43, s21, 54 +; SI-NEXT: s_lshr_b64 s[20:21], s[68:69], 16 +; SI-NEXT: s_or_b32 s11, s5, s64 +; SI-NEXT: s_and_b32 s5, s19, 0xffff +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s28, s23, 24 +; SI-NEXT: v_writelane_b32 v43, s20, 55 +; SI-NEXT: s_or_b32 s9, s5, s25 +; SI-NEXT: s_and_b32 s5, s26, 0xffff +; SI-NEXT: s_or_b32 s48, s28, s22 +; SI-NEXT: v_writelane_b32 v43, s21, 56 +; SI-NEXT: s_lshr_b64 s[20:21], s[70:71], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[98:99], 16 +; SI-NEXT: s_mov_b32 s98, s7 +; SI-NEXT: s_or_b32 s7, s5, s57 +; SI-NEXT: s_and_b32 s5, s61, 0xffff +; SI-NEXT: v_writelane_b32 v43, s20, 57 +; SI-NEXT: s_or_b32 s5, s5, s48 +; SI-NEXT: v_writelane_b32 v43, s21, 58 +; SI-NEXT: s_lshr_b64 s[22:23], s[80:81], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[82:83], 16 +; SI-NEXT: v_readlane_b32 s82, v43, 62 +; SI-NEXT: s_lshr_b64 s[34:35], s[84:85], 16 +; SI-NEXT: v_readlane_b32 s84, v42, 0 +; SI-NEXT: s_lshr_b64 s[30:31], s[86:87], 16 +; SI-NEXT: v_readlane_b32 s86, v42, 1 +; SI-NEXT: s_lshr_b64 s[94:95], s[96:97], 16 +; SI-NEXT: s_mov_b32 s96, s65 +; SI-NEXT: s_lshr_b64 s[36:37], s[12:13], 16 +; SI-NEXT: s_lshr_b32 s91, s53, 16 +; SI-NEXT: s_lshr_b32 s29, s27, 16 +; SI-NEXT: s_mov_b32 s26, s28 +; SI-NEXT: s_lshr_b32 s27, s67, 16 +; SI-NEXT: s_lshr_b32 s23, s43, 16 +; SI-NEXT: s_lshr_b32 s21, s47, 16 +; SI-NEXT: s_lshr_b32 s19, s59, 16 +; SI-NEXT: s_lshr_b32 s17, s63, 16 +; SI-NEXT: s_lshr_b32 s89, s75, 16 +; SI-NEXT: s_lshr_b32 s68, s45, 16 +; SI-NEXT: s_lshr_b32 s40, s79, 16 +; SI-NEXT: s_lshr_b32 s28, s49, 16 +; SI-NEXT: s_lshr_b32 s14, s51, 16 +; SI-NEXT: s_lshr_b32 s12, s64, 16 +; SI-NEXT: s_lshr_b32 s10, s25, 16 +; SI-NEXT: s_lshr_b32 s8, s57, 16 +; SI-NEXT: s_lshr_b32 s6, s48, 16 +; SI-NEXT: s_mov_b32 s75, vcc_hi +; SI-NEXT: s_mov_b32 s73, s69 +; SI-NEXT: s_mov_b32 s63, s71 +; SI-NEXT: s_mov_b32 s61, s81 +; SI-NEXT: s_mov_b32 s59, s83 +; SI-NEXT: v_readlane_b32 s80, v43, 60 +; SI-NEXT: s_mov_b32 s57, s85 +; SI-NEXT: v_readlane_b32 s85, v42, 2 +; SI-NEXT: s_mov_b32 s47, s87 +; SI-NEXT: s_mov_b32 s45, s97 +; SI-NEXT: v_readlane_b32 s97, v42, 3 +; SI-NEXT: s_mov_b32 s43, s99 +; SI-NEXT: s_mov_b32 s77, s41 +; SI-NEXT: s_mov_b32 s51, s15 +; SI-NEXT: v_readlane_b32 s65, v43, 63 +; SI-NEXT: s_mov_b32 s53, s13 +; SI-NEXT: v_readlane_b32 s15, v43, 61 +; SI-NEXT: s_mov_b32 s55, s11 +; SI-NEXT: s_mov_b32 s79, s9 +; SI-NEXT: s_mov_b32 s67, s7 +; SI-NEXT: v_readlane_b32 s70, v43, 59 +; SI-NEXT: s_mov_b32 s25, s5 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: v_readlane_b32 s4, v43, 42 +; SI-NEXT: v_readlane_b32 s4, v44, 50 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s6, v43, 41 +; SI-NEXT: v_readlane_b32 s5, v44, 18 +; SI-NEXT: v_readlane_b32 s6, v44, 17 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v43, 18 +; SI-NEXT: v_readlane_b32 s5, v44, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 @@ -204189,15 +203615,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v43, 17 -; SI-NEXT: s_add_i32 s5, s5, 3 -; SI-NEXT: v_readlane_b32 s6, v43, 16 -; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: s_add_i32 s5, s70, 3 +; SI-NEXT: v_readlane_b32 s6, v44, 43 +; SI-NEXT: v_readlane_b32 s7, v44, 15 ; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 8 ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s6, v43, 14 +; SI-NEXT: v_readlane_b32 s6, v44, 14 ; SI-NEXT: s_and_b32 s7, s7, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 @@ -204205,14 +203630,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s6, v43, 44 +; SI-NEXT: v_readlane_b32 s6, v44, 55 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_readlane_b32 s8, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 49 +; SI-NEXT: v_readlane_b32 s8, v44, 22 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s7, s91, 8 +; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readlane_b32 s7, v43, 23 +; SI-NEXT: v_readlane_b32 s7, v44, 21 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 @@ -204220,739 +203646,750 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: s_add_i32 s24, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v43, 33 +; SI-NEXT: s_add_i32 s25, s5, 0x3000000 +; SI-NEXT: s_add_i32 s66, s6, 0x3000000 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 31 +; SI-NEXT: v_readlane_b32 s6, v43, 26 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 27 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s43, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v43, 42 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 39 +; SI-NEXT: v_readlane_b32 s6, v43, 36 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 35 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s44, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v43, 30 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 24 +; SI-NEXT: v_readlane_b32 s6, v43, 18 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 17 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s45, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v43, 34 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 32 +; SI-NEXT: v_readlane_b32 s6, v43, 28 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 29 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s46, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v43, 25 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 23 +; SI-NEXT: v_readlane_b32 s6, v43, 19 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 20 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s47, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v43, 22 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 21 +; SI-NEXT: v_readlane_b32 s6, v43, 15 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s56, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v43, 14 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 13 +; SI-NEXT: v_readlane_b32 s6, v43, 9 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 10 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s57, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v43, 12 +; SI-NEXT: v_readlane_b32 s7, v44, 40 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 11 +; SI-NEXT: v_readlane_b32 s6, v43, 7 ; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_readlane_b32 s8, v43, 21 -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: v_readlane_b32 s8, v44, 36 +; SI-NEXT: v_readlane_b32 s9, v44, 20 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_and_b32 s7, s7, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 8 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readlane_b32 s8, v43, 19 +; SI-NEXT: v_readlane_b32 s8, v44, 19 ; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s8, s8, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readlane_b32 s8, v43, 43 +; SI-NEXT: v_readlane_b32 s8, v44, 54 +; SI-NEXT: s_add_i32 s58, s4, 0x3000000 +; SI-NEXT: s_add_i32 s4, s80, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 6 +; SI-NEXT: v_readlane_b32 s6, v43, 2 ; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: v_readlane_b32 s9, v44, 25 +; SI-NEXT: v_readlane_b32 s10, v44, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s9, s78, 8 -; SI-NEXT: s_add_i32 s10, s89, 3 +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v43, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v43, 29 +; SI-NEXT: v_readlane_b32 s9, v44, 23 ; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s9, s9, 24 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v43, 28 +; SI-NEXT: v_readlane_b32 s9, v44, 44 +; SI-NEXT: s_add_i32 s59, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v43, 5 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: v_readlane_b32 s10, v43, 27 -; SI-NEXT: v_readlane_b32 s11, v43, 26 +; SI-NEXT: v_readlane_b32 s10, v44, 42 +; SI-NEXT: v_readlane_b32 s11, v44, 38 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 4 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s96, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readlane_b32 s10, v43, 25 +; SI-NEXT: v_readlane_b32 s10, v44, 39 ; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 24 ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s98, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readlane_b32 s10, v43, 46 +; SI-NEXT: v_readlane_b32 s10, v44, 57 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: v_readlane_b32 s11, v43, 45 -; SI-NEXT: v_readlane_b32 s12, v43, 32 +; SI-NEXT: v_readlane_b32 s11, v44, 56 +; SI-NEXT: v_readlane_b32 s12, v44, 53 +; SI-NEXT: s_add_i32 s60, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v43, 1 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v43, 0 +; SI-NEXT: v_readlane_b32 s6, v44, 62 ; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readlane_b32 s11, v44, 28 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s11, s70, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_lshl_b32 s11, s11, 24 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v44, 63 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readlane_b32 s11, v43, 31 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s13, v43, 30 +; SI-NEXT: s_add_i32 s11, s15, 3 +; SI-NEXT: v_readlane_b32 s12, v44, 37 +; SI-NEXT: v_readlane_b32 s13, v44, 27 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: s_lshl_b32 s12, s96, 8 +; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_readlane_b32 s12, v44, 26 ; SI-NEXT: s_and_b32 s13, s13, 0xff -; SI-NEXT: s_lshl_b32 s12, s82, 24 +; SI-NEXT: s_add_i32 s61, s4, 0x3000000 +; SI-NEXT: s_add_i32 s4, s86, 3 +; SI-NEXT: s_lshl_b32 s12, s12, 24 ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_add_i32 s6, s65, 3 ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: s_add_i32 s12, s36, 3 +; SI-NEXT: v_readlane_b32 s12, v44, 31 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s82, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s13, s16, 8 -; SI-NEXT: s_add_i32 s14, s30, 3 +; SI-NEXT: s_lshl_b32 s13, s97, 8 +; SI-NEXT: s_add_i32 s14, s85, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readlane_b32 s13, v44, 30 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s13, s69, 24 +; SI-NEXT: s_add_i32 s62, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v44, 61 +; SI-NEXT: s_lshl_b32 s13, s13, 24 ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v44, 60 +; SI-NEXT: v_readlane_b32 s6, v44, 58 ; SI-NEXT: s_or_b32 s13, s13, s14 ; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v43, 36 +; SI-NEXT: v_readlane_b32 s13, v44, 29 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v44, 59 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s14, v43, 35 -; SI-NEXT: v_readlane_b32 s15, v43, 34 +; SI-NEXT: v_readlane_b32 s14, v44, 48 +; SI-NEXT: v_readlane_b32 s15, v44, 45 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: v_readlane_b32 s14, v43, 33 +; SI-NEXT: v_readlane_b32 s14, v44, 41 ; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_lshl_b32 s14, s14, 24 ; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_add_i32 s63, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v44, 5 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v44, 4 +; SI-NEXT: v_readlane_b32 s6, v44, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: v_readlane_b32 s14, v42, 50 -; SI-NEXT: s_add_i32 s17, s14, 3 -; SI-NEXT: v_readlane_b32 s15, v42, 49 -; SI-NEXT: s_and_b32 s14, s17, 0xff +; SI-NEXT: v_readlane_b32 s14, v43, 52 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: v_readlane_b32 s15, v43, 51 +; SI-NEXT: v_readlane_b32 s16, v44, 35 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v44, 2 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s15, s15, 8 -; SI-NEXT: s_add_i32 s16, s95, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readlane_b32 s15, v44, 34 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s15, s93, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshl_b32 s15, s15, 24 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_or_b32 s15, s15, s16 ; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_add_i32 s72, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v44, 1 ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_readlane_b32 s15, v43, 40 +; SI-NEXT: v_readlane_b32 s15, v44, 47 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v44, 0 +; SI-NEXT: v_readlane_b32 s6, v44, 51 ; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: v_readlane_b32 s16, v43, 39 -; SI-NEXT: v_readlane_b32 s17, v43, 38 +; SI-NEXT: v_readlane_b32 s16, v44, 46 +; SI-NEXT: v_readlane_b32 s17, v44, 33 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_and_b32 s15, s15, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v44, 52 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readlane_b32 s16, v43, 37 +; SI-NEXT: v_readlane_b32 s16, v44, 32 ; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 24 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readlane_b32 s16, v42, 48 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 47 -; SI-NEXT: v_readlane_b32 s18, v42, 42 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s99, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 43 -; SI-NEXT: s_and_b32 s18, s99, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 38 -; SI-NEXT: s_add_i32 s87, s17, 3 -; SI-NEXT: v_readlane_b32 s18, v42, 35 -; SI-NEXT: v_readlane_b32 s19, v42, 29 -; SI-NEXT: s_and_b32 s17, s87, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 8 -; SI-NEXT: s_add_i32 s23, s19, 3 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v42, 30 -; SI-NEXT: s_and_b32 s23, s23, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_lshl_b32 s23, s23, 16 -; SI-NEXT: s_addk_i32 s17, 0x300 -; SI-NEXT: s_or_b32 s18, s18, s23 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_add_i32 s40, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 45 -; SI-NEXT: s_add_i32 s41, s17, 0x3000000 -; SI-NEXT: s_add_i32 s68, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 44 -; SI-NEXT: v_readlane_b32 s18, v42, 39 -; SI-NEXT: s_and_b32 s16, s68, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s96, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 40 -; SI-NEXT: s_and_b32 s18, s96, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 33 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_readlane_b32 s18, v42, 31 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 8 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v42, 24 -; SI-NEXT: s_addk_i32 s17, 0x300 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s42, s16, 0x3000000 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: v_readlane_b32 s17, v42, 25 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s43, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 46 -; SI-NEXT: s_add_i32 s23, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 41 -; SI-NEXT: v_readlane_b32 s18, v42, 37 -; SI-NEXT: s_and_b32 s16, s23, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s86, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 36 -; SI-NEXT: s_and_b32 s18, s86, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s44, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 28 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 22 -; SI-NEXT: v_readlane_b32 s18, v42, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 15 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s45, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 34 -; SI-NEXT: s_add_i32 s83, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 32 -; SI-NEXT: v_readlane_b32 s18, v42, 26 -; SI-NEXT: s_and_b32 s16, s83, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 27 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s46, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 23 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 21 -; SI-NEXT: v_readlane_b32 s18, v42, 17 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 18 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s47, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 20 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 19 -; SI-NEXT: v_readlane_b32 s18, v42, 13 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 14 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s56, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 12 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 11 -; SI-NEXT: v_readlane_b32 s18, v42, 7 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 8 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s57, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 10 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 9 -; SI-NEXT: v_readlane_b32 s18, v42, 5 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 6 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s58, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 4 +; SI-NEXT: v_readlane_b32 s16, v43, 50 +; SI-NEXT: s_add_i32 s73, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v44, 13 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 3 -; SI-NEXT: v_readlane_b32 s18, v43, 63 +; SI-NEXT: v_readlane_b32 s17, v43, 49 +; SI-NEXT: v_readlane_b32 s18, v43, 47 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v44, 12 +; SI-NEXT: v_readlane_b32 s6, v44, 11 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 0 +; SI-NEXT: v_readlane_b32 s17, v43, 48 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v44, 10 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s59, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 2 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 1 -; SI-NEXT: v_readlane_b32 s18, v43, 61 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 62 -; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s60, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 60 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 59 -; SI-NEXT: v_readlane_b32 s18, v43, 55 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 56 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s61, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 58 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s28, s17, s16 +; SI-NEXT: v_readlane_b32 s16, v43, 41 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 57 -; SI-NEXT: v_readlane_b32 s18, v43, 53 +; SI-NEXT: v_readlane_b32 s17, v43, 40 +; SI-NEXT: v_readlane_b32 s18, v43, 37 +; SI-NEXT: s_add_i32 s74, s4, 0x3000000 +; SI-NEXT: v_readlane_b32 s4, v44, 9 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v44, 8 +; SI-NEXT: v_readlane_b32 s6, v44, 7 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 54 +; SI-NEXT: v_readlane_b32 s17, v43, 38 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s62, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 52 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 51 -; SI-NEXT: v_readlane_b32 s18, v43, 49 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 50 -; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readlane_b32 s5, v44, 6 +; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s63, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 5 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 4 -; SI-NEXT: v_readlane_b32 s18, v43, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 2 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s72, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 1 +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s22, s17, s16 +; SI-NEXT: v_readlane_b32 s16, v43, 46 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 0 -; SI-NEXT: v_readlane_b32 s18, v43, 47 +; SI-NEXT: v_readlane_b32 s17, v43, 45 +; SI-NEXT: v_readlane_b32 s18, v43, 43 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s75, s4, 0x3000000 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 48 +; SI-NEXT: v_readlane_b32 s17, v43, 44 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 16 ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s73, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 13 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 12 -; SI-NEXT: v_readlane_b32 s18, v43, 11 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 10 -; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_writelane_b32 v43, s4, 53 ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s74, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 9 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 8 -; SI-NEXT: v_readlane_b32 s18, v43, 7 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 6 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_addk_i32 s16, 0x300 -; SI-NEXT: s_lshl_b32 s17, s17, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s4, s4, 0x3000000 -; SI-NEXT: s_add_i32 s5, s5, 0x3000000 -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_add_i32 s8, s8, 0x3000000 -; SI-NEXT: s_add_i32 s9, s9, 0x3000000 -; SI-NEXT: s_add_i32 s10, s10, 0x3000000 -; SI-NEXT: s_add_i32 s11, s11, 0x3000000 -; SI-NEXT: s_add_i32 s12, s12, 0x3000000 -; SI-NEXT: s_add_i32 s13, s13, 0x3000000 -; SI-NEXT: s_add_i32 s14, s14, 0x3000000 -; SI-NEXT: s_add_i32 s15, s15, 0x3000000 -; SI-NEXT: s_add_i32 s75, s16, 0x3000000 -; SI-NEXT: s_lshr_b64 s[76:77], s[74:75], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[48:49], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[64:65], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[26:27], s[72:73], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[60:61], 16 -; SI-NEXT: s_lshr_b64 s[20:21], s[58:59], 16 -; SI-NEXT: s_lshr_b64 s[24:25], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: v_writelane_b32 v43, s5, 54 +; SI-NEXT: s_lshr_b64 s[4:5], s[72:73], 16 +; SI-NEXT: s_or_b32 s40, s17, s16 +; SI-NEXT: s_add_i32 s67, s7, 0x3000000 +; SI-NEXT: s_add_i32 s78, s8, 0x3000000 +; SI-NEXT: s_add_i32 s79, s9, 0x3000000 +; SI-NEXT: s_add_i32 s54, s10, 0x3000000 +; SI-NEXT: s_add_i32 s55, s11, 0x3000000 +; SI-NEXT: s_add_i32 s50, s14, 0x3000000 +; SI-NEXT: s_add_i32 s51, s15, 0x3000000 +; SI-NEXT: s_add_i32 s76, s28, 0x3000000 +; SI-NEXT: s_add_i32 s77, s22, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s4, 55 +; SI-NEXT: s_add_i32 s52, s12, 0x3000000 +; SI-NEXT: s_add_i32 s53, s13, 0x3000000 +; SI-NEXT: s_add_i32 s42, s40, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s5, 56 +; SI-NEXT: s_lshr_b64 s[4:5], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[76:77], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[50:51], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[54:55], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[78:79], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[66:67], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[24:25], 16 +; SI-NEXT: v_writelane_b32 v43, s4, 57 +; SI-NEXT: s_lshr_b64 s[38:39], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[84:85], s[42:43], 16 -; SI-NEXT: s_lshr_b32 s55, s75, 16 -; SI-NEXT: s_lshr_b32 s53, s73, 16 -; SI-NEXT: s_lshr_b32 s51, s63, 16 -; SI-NEXT: s_lshr_b32 s49, s61, 16 -; SI-NEXT: s_lshr_b32 s86, s59, 16 -; SI-NEXT: s_lshr_b32 s39, s57, 16 -; SI-NEXT: s_lshr_b32 s18, s47, 16 -; SI-NEXT: s_lshr_b32 s22, s45, 16 -; SI-NEXT: s_lshr_b32 s97, s43, 16 -; SI-NEXT: s_lshr_b32 s65, s41, 16 -; SI-NEXT: s_lshr_b32 s19, s15, 16 -; SI-NEXT: s_lshr_b32 s71, s13, 16 -; SI-NEXT: s_lshr_b32 s67, s11, 16 -; SI-NEXT: s_lshr_b32 s80, s9, 16 -; SI-NEXT: s_lshr_b32 s81, s7, 16 -; SI-NEXT: s_lshr_b32 s77, s5, 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[52:53], 16 +; SI-NEXT: s_lshr_b32 s91, s75, 16 +; SI-NEXT: s_lshr_b32 s29, s73, 16 +; SI-NEXT: s_lshr_b32 s27, s63, 16 +; SI-NEXT: s_lshr_b32 s23, s61, 16 +; SI-NEXT: s_lshr_b32 s21, s59, 16 +; SI-NEXT: s_lshr_b32 s19, s57, 16 +; SI-NEXT: s_lshr_b32 s17, s47, 16 +; SI-NEXT: s_lshr_b32 s89, s45, 16 +; SI-NEXT: s_lshr_b32 s68, s43, 16 +; SI-NEXT: s_lshr_b32 s40, s77, 16 +; SI-NEXT: s_lshr_b32 s28, s51, 16 +; SI-NEXT: s_lshr_b32 s14, s53, 16 +; SI-NEXT: s_lshr_b32 s12, s55, 16 +; SI-NEXT: s_lshr_b32 s10, s79, 16 +; SI-NEXT: s_lshr_b32 s8, s67, 16 +; SI-NEXT: s_lshr_b32 s6, s25, 16 +; SI-NEXT: v_writelane_b32 v43, s5, 58 ; SI-NEXT: .LBB97_3: ; %end -; SI-NEXT: s_and_b32 s16, s74, 0xffff -; SI-NEXT: s_lshl_b32 s17, s76, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: s_and_b32 s16, s75, 0xffff -; SI-NEXT: s_lshl_b32 s17, s55, 16 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s72, 0xffff -; SI-NEXT: s_lshl_b32 s17, s26, 16 +; SI-NEXT: v_readlane_b32 vcc_lo, v43, 53 +; SI-NEXT: s_and_b32 s4, s74, 0xffff +; SI-NEXT: s_lshl_b32 s5, vcc_lo, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: s_and_b32 s4, s75, 0xffff +; SI-NEXT: s_lshl_b32 s5, s91, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s74, v43, 55 +; SI-NEXT: v_readlane_b32 vcc_hi, v43, 54 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s72, 0xffff +; SI-NEXT: s_lshl_b32 s5, s74, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s73, 0xffff -; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s73, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s72, v43, 57 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s62, 0xffff -; SI-NEXT: s_lshl_b32 s17, s28, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s62, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s63, 0xffff -; SI-NEXT: s_lshl_b32 s17, s51, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s63, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s60, 0xffff -; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s60, 0xffff +; SI-NEXT: s_lshl_b32 s5, s22, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s61, 0xffff -; SI-NEXT: s_lshl_b32 s17, s49, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s61, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s58, 0xffff -; SI-NEXT: s_lshl_b32 s17, s20, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s58, 0xffff +; SI-NEXT: s_lshl_b32 s5, s38, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s59, 0xffff -; SI-NEXT: s_lshl_b32 s17, s86, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s59, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s56, 0xffff -; SI-NEXT: s_lshl_b32 s17, s24, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s56, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s57, 0xffff -; SI-NEXT: s_lshl_b32 s17, s39, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s57, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s46, 0xffff -; SI-NEXT: s_lshl_b32 s17, s34, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s46, 0xffff +; SI-NEXT: s_lshl_b32 s5, s30, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s47, 0xffff -; SI-NEXT: s_lshl_b32 s17, s18, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s47, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s44, 0xffff -; SI-NEXT: s_lshl_b32 s17, s94, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s44, 0xffff +; SI-NEXT: s_lshl_b32 s5, s94, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s45, 0xffff -; SI-NEXT: s_lshl_b32 s17, s22, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s45, 0xffff +; SI-NEXT: s_lshl_b32 s5, s89, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s42, 0xffff -; SI-NEXT: s_lshl_b32 s17, s84, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s43, 0xffff -; SI-NEXT: s_lshl_b32 s17, s97, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s43, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s40, 0xffff -; SI-NEXT: s_lshl_b32 s17, s38, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s76, 0xffff +; SI-NEXT: s_lshl_b32 s5, s90, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s41, 0xffff -; SI-NEXT: s_lshl_b32 s17, s65, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s77, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s16, s48, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s50, 0xffff +; SI-NEXT: s_lshl_b32 s5, s26, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 -; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s19, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s51, 0xffff +; SI-NEXT: s_lshl_b32 s5, s28, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s14, s50, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s52, 0xffff +; SI-NEXT: s_lshl_b32 s5, s36, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s71, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s53, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s12, s52, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s54, 0xffff +; SI-NEXT: s_lshl_b32 s5, s20, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 -; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xffff -; SI-NEXT: s_lshl_b32 s11, s67, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s55, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s10, s54, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s78, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 -; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s8, s9, 0xffff -; SI-NEXT: s_lshl_b32 s9, s80, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s79, 0xffff +; SI-NEXT: s_lshl_b32 s5, s10, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s8, s64, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s66, 0xffff +; SI-NEXT: s_lshl_b32 s5, s18, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 -; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s6, s7, 0xffff -; SI-NEXT: s_lshl_b32 s7, s81, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s67, 0xffff +; SI-NEXT: s_lshl_b32 s5, s8, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s6, s66, 16 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s16, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 -; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: s_and_b32 s4, s25, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -204960,6 +204397,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s75, v43, 56 +; SI-NEXT: v_readlane_b32 s73, v43, 58 ; SI-NEXT: v_readlane_b32 s99, v41, 35 ; SI-NEXT: v_readlane_b32 s98, v41, 34 ; SI-NEXT: v_readlane_b32 s97, v41, 33 @@ -205000,58 +204439,65 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v43, s4, 53 +; SI-NEXT: v_writelane_b32 v43, s5, 54 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v43, s4, 55 +; SI-NEXT: v_writelane_b32 v43, s5, 56 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v43, s4, 57 +; SI-NEXT: v_writelane_b32 v43, s5, 58 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr23 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr81 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v128i8_to_v64i16_scalar: @@ -205073,19 +204519,19 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill @@ -205115,10 +204561,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v44, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v29 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -205127,46 +204573,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v41, 8, v26 +; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v30 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v31 +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 @@ -205175,55 +204617,40 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v36 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -205234,805 +204661,824 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:36 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB97_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff ; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s5, s5, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s8, s4, s5 +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v44, v8 +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v45, v10 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v38, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v39, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v51, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v53, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_or_b32_sdwa v1, v63, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_or_b32_sdwa v0, v39, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v42, v43 -; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v47, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_or_b32_sdwa v0, v41, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v52, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v35, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v61, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v58, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v36, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v34, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v28, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v25, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v58, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v26, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v46, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v28, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v57, v38 +; VI-NEXT: v_or_b32_sdwa v0, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v40, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v59, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v51, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v41 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v46, v1 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v29, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v56, v1 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v47, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v39 -; VI-NEXT: v_mov_b32_e32 v54, v33 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v43, v62 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v52, v60 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v54, v63 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_branch .LBB97_3 ; VI-NEXT: .LBB97_2: -; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v7 -; VI-NEXT: v_mov_b32_e32 v57, v5 -; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: v_mov_b32_e32 v57, v38 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB97_3: ; %Flow -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; VI-NEXT: s_cbranch_vccnz .LBB97_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_lshl_b32 s5, s27, 8 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s6, s27, 8 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_lshl_b32 s6, s25, 8 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_lshl_b32 s8, s21, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_and_b32 s8, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s21, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_lshl_b32 s9, s19, 8 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s9, s18, 0xff +; VI-NEXT: s_lshl_b32 s10, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v29, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_or_b32 s9, s10, s9 +; VI-NEXT: s_and_b32 s10, s16, 0xff +; VI-NEXT: s_lshl_b32 s11, s17, 8 +; VI-NEXT: s_or_b32 s10, s11, s10 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_addk_i32 s8, 0x300 +; VI-NEXT: s_addk_i32 s10, 0x300 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_and_b32 s10, s10, 0xffff +; VI-NEXT: s_and_b32 s8, s8, 0xffff +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_add_i32 s9, s9, 0x3000000 +; VI-NEXT: s_add_i32 s7, s7, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 -; VI-NEXT: v_or_b32_sdwa v30, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 -; VI-NEXT: v_or_b32_sdwa v28, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v44, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v27, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v63 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v40, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v30, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v26, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v34, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 -; VI-NEXT: v_or_b32_sdwa v26, v26, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_or_b32_sdwa v29, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v13, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v28, v53, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v21, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v21 -; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v14, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v24, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v27, v49, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v32, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 -; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v15, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v26, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 +; VI-NEXT: v_or_b32_sdwa v37, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: v_or_b32_sdwa v26, v26, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v25, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v38, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v24, v36, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v46 +; VI-NEXT: v_or_b32_sdwa v39, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v15 +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v14 +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 +; VI-NEXT: v_or_b32_sdwa v24, v24, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v25, v25, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v28, v28, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v61, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v61 -; VI-NEXT: v_or_b32_sdwa v23, v23, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v23, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v13 +; VI-NEXT: v_or_b32_sdwa v29, v29, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v48, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v23, v23, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v36, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 -; VI-NEXT: v_or_b32_sdwa v22, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v22, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v49, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v22, v22, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v21, v60, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v63, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v38, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 -; VI-NEXT: v_or_b32_sdwa v21, v63, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v50, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_or_b32_sdwa v20, v45, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 +; VI-NEXT: v_or_b32_sdwa v51, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 +; VI-NEXT: v_or_b32_sdwa v19, v43, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v52, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v19, v19, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v20, v20, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v21, v21, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 -; VI-NEXT: v_or_b32_sdwa v39, v45, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 -; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 +; VI-NEXT: v_or_b32_sdwa v18, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v19, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v48, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 -; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 -; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v62, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v62 -; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v53, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x300, v53 +; VI-NEXT: v_or_b32_sdwa v18, v18, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v51 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v50 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v17, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v49, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 -; VI-NEXT: v_or_b32_sdwa v15, v15, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_or_b32_sdwa v54, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v17, v17, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v17 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v16, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 -; VI-NEXT: v_or_b32_sdwa v14, v14, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v2 -; VI-NEXT: v_or_b32_sdwa v29, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 +; VI-NEXT: v_or_b32_sdwa v55, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v55, vcc, 0x300, v55 +; VI-NEXT: v_or_b32_sdwa v16, v16, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v16 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v13, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v52, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v52 -; VI-NEXT: v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v44 -; VI-NEXT: v_or_b32_sdwa v28, v28, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v40, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v41, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v1 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v42, v58, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x300, v42 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v54, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v54, vcc, 0x300, v54 -; VI-NEXT: v_or_b32_sdwa v12, v12, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v12, v47, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v43, v56, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v40, vcc, 0x300, v43 +; VI-NEXT: v_or_b32_sdwa v12, v12, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v50, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v11, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v44, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v11, v11, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v53, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v10, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v45, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v10, v10, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v9, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 -; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v10 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 -; VI-NEXT: v_or_b32_sdwa v49, v16, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v53, vcc, 0x300, v40 -; VI-NEXT: v_or_b32_sdwa v27, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v46, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v9, v9, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v8, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v43, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v43 -; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v41 -; VI-NEXT: v_or_b32_sdwa v17, v17, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v11, v50, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v49 -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v0 -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v30, v30, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v47, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v47 +; VI-NEXT: v_or_b32_sdwa v8, v8, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v7, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v56, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v56 ; VI-NEXT: v_or_b32_sdwa v7, v7, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v6, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_or_b32_sdwa v57, v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v57 ; VI-NEXT: v_or_b32_sdwa v6, v6, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v5, v32, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v2 +; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v47, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v58, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v47, vcc, 0x300, v47 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v47, vcc, 0x300, v58 ; VI-NEXT: v_or_b32_sdwa v5, v5, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v30, v30, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 -; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v56, vcc, 0x300, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v4, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v56, vcc, 3, v56 -; VI-NEXT: v_or_b32_sdwa v56, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v56, s4, v56 -; VI-NEXT: s_and_b32 s4, s26, 0xff -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_and_b32 s5, s24, 0xff -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_and_b32 s7, s20, 0xff -; VI-NEXT: s_or_b32 s7, s8, s7 -; VI-NEXT: s_and_b32 s8, s18, 0xff -; VI-NEXT: s_or_b32 s8, s9, s8 -; VI-NEXT: s_and_b32 s9, s16, 0xff -; VI-NEXT: s_or_b32 s9, s10, s9 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_addk_i32 s7, 0x300 -; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_lshl_b32 s8, s8, 16 -; VI-NEXT: s_and_b32 s9, s9, 0xffff -; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_add_i32 s8, s8, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v56 -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v57, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v3, s4, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v3 ; VI-NEXT: .LBB97_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -206072,35 +205518,31 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 @@ -206111,268 +205553,284 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 ; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v56 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v52 ; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v51 ; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v55 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v53 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v20 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v26 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v28 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 -; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v24 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:248 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 -; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v37 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:312 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:328 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:36 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:44 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v1 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:68 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(55) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(60) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB97_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -206380,722 +205838,725 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v57, v5 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v34, v35 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s4, s5 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v46, v32 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v17, v45, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v45, v59 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v55, v22 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v16, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v47, v32 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_mov_b32_e32 v33, v35 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v51, v57 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v49, v39 -; GFX9-NEXT: v_mov_b32_e32 v59, v44 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v58, v50 -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v54, v63 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v41, v39 +; GFX9-NEXT: v_mov_b32_e32 v39, v54 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v52, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v54, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v37, v29 +; GFX9-NEXT: v_mov_b32_e32 v34, v37 +; GFX9-NEXT: v_mov_b32_e32 v37, v53 +; GFX9-NEXT: v_mov_b32_e32 v53, v36 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v48, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v38, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v36, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v62, v61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v59, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v57, v35 -; GFX9-NEXT: v_mov_b32_e32 v35, v38 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_branch .LBB97_3 ; GFX9-NEXT: .LBB97_2: -; GFX9-NEXT: v_mov_b32_e32 v58, v50 -; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v34, v35 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v49, v39 -; GFX9-NEXT: v_mov_b32_e32 v55, v22 -; GFX9-NEXT: v_mov_b32_e32 v51, v5 +; GFX9-NEXT: v_mov_b32_e32 v58, v39 +; GFX9-NEXT: v_mov_b32_e32 v49, v40 +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v46, v32 +; GFX9-NEXT: v_mov_b32_e32 v35, v32 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB97_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB97_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v16, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v16, v45, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v15, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v15, v46, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_addk_i32 s4, 0x300 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s24, s24, 3 -; GFX9-NEXT: s_lshl_b32 s5, s25, 8 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s25, 8 ; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: s_lshl_b32 s6, s27, 8 -; GFX9-NEXT: s_add_i32 s20, s20, 3 -; GFX9-NEXT: s_lshl_b32 s7, s21, 8 -; GFX9-NEXT: s_add_i32 s22, s22, 3 -; GFX9-NEXT: s_lshl_b32 s8, s23, 8 -; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_lshl_b32 s9, s17, 8 -; GFX9-NEXT: s_add_i32 s18, s18, 3 -; GFX9-NEXT: s_lshl_b32 s10, s19, 8 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 -; GFX9-NEXT: v_or_b32_sdwa v23, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 -; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_and_b32 s4, s24, 0xff -; GFX9-NEXT: s_or_b32 s4, s5, s4 -; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_and_b32 s6, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s27, 8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 ; GFX9-NEXT: s_or_b32 s6, s7, s6 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_and_b32 s7, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 ; GFX9-NEXT: s_or_b32 s7, s8, s7 -; GFX9-NEXT: s_and_b32 s8, s16, 0xff +; GFX9-NEXT: s_and_b32 s8, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s23, 8 +; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_and_b32 s9, s18, 0xff +; GFX9-NEXT: s_and_b32 s9, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s17, 8 +; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_or_b32 s9, s10, s9 -; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_and_b32 s10, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s19, 8 +; GFX9-NEXT: s_or_b32 s10, s11, s10 ; GFX9-NEXT: s_addk_i32 s5, 0x300 ; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_addk_i32 s7, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_addk_i32 s9, 0x300 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_addk_i32 s10, 0x300 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v22, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v4, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v24, v59, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v25, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v7, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v8, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v28, v48, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v28 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v9, 3, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v37, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v10, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v37, v37, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v11, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v39, v39, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v12, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v48, v52, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v48 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v13, v51, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v14, 3, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v38, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v14, v47, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v17, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v50, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v18, 3, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v39, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v18, v43, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v48, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v19, v44, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v20, 3, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v20, v42, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v50, v34, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v51, v33, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v52, v63, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v25 +; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v14 +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v23 +; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v18 +; GFX9-NEXT: v_add_u32_e32 v50, 0x300, v50 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v42, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v49, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v43, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v43 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v58 -; GFX9-NEXT: v_or_b32_sdwa v19, v51, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v51, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: v_or_b32_sdwa v44, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v45, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v62 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v46, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v47, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v56 +; GFX9-NEXT: v_add_u32_e32 v49, 0x300, v19 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v55 +; GFX9-NEXT: v_add_u32_e32 v55, 0x300, v44 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v46 +; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v47 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v56, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v57, v57, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v35 +; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v56 +; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v27 +; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v37 +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v39 +; GFX9-NEXT: v_add_u32_e32 v39, 0x300, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v58, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v58 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v59, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v21, 3, v36 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v38 +; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v13 +; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v57 +; GFX9-NEXT: v_add_u32_e32 v59, 0x300, v59 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v34 -; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v35, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 -; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 +; GFX9-NEXT: v_or_b32_sdwa v60, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v60 -; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 -; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 -; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v46 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v36, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 -; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_or_b32_sdwa v24, v57, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 -; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 -; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 +; GFX9-NEXT: v_or_b32_sdwa v61, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v58, 0x300, v61 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 -; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v62, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v41, 0x300, v62 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v63, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v57, 0x300, v63 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v21, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 -; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 -; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 -; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 -; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 -; GFX9-NEXT: v_add_u32_e32 v39, 0x300, v49 -; GFX9-NEXT: v_add_u32_e32 v49, 0x300, v53 -; GFX9-NEXT: v_add_u32_e32 v50, 0x300, v55 -; GFX9-NEXT: v_add_u32_e32 v53, 0x300, v45 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v56, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v5 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v43, 0x300, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v7 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v44, 0x300, v6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v8 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v9 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v45, 0x300, v8 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v7, v45, 16, v7 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v10 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v9 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v11 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v46, 0x300, v10 +; GFX9-NEXT: v_lshl_or_b32 v6, v46, 16, v6 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v9 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v12 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v22 +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v51 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v52 +; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v54 +; GFX9-NEXT: v_add_u32_e32 v54, 0x300, v42 +; GFX9-NEXT: v_add_u32_e32 v42, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v5 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v8, v44, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; GFX9-NEXT: v_lshl_or_b32 v9, v43, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; GFX9-NEXT: v_lshl_or_b32 v10, v42, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v58 +; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v20 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v53 +; GFX9-NEXT: v_add_u32_e32 v53, 0x300, v40 +; GFX9-NEXT: v_add_u32_e32 v40, 0x300, v60 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v41, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; GFX9-NEXT: v_add_u32_e32 v47, 0x300, v12 +; GFX9-NEXT: v_lshl_or_b32 v12, v40, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v5, v47, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_lshl_or_b32 v16, v55, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v17, v54, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 -; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 +; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 ; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 ; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 ; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 +; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 ; GFX9-NEXT: .LBB97_5: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -207326,14 +206787,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_and_b32 v1, 0xff, v35 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8 @@ -207343,186 +206799,170 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s5, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s7, s8 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-TRUE16-NEXT: s_and_b32 s9, s28, 0xff +; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s29, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v66 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v65 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_and_b32 v1, 0xff, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v118 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v67 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v68 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v69 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v68 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v1, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v48 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v80 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v80 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v81 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v3, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v81 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v71 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v51 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v83 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v3, v86 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v84 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v1.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v85 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v96 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v85 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v3, v86 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v54 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v98 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v87 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v97 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v102 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v101 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v1, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v114 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v113 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v116 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v128 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v134 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v133 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v132 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v133 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v134 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v161 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v129 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v147 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v166 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v144 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v147 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v3, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v148 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v149 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v151 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v2, v166 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v177 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v180 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v149 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v42 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v180 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v115 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v178 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v1, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v44 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v45 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v44 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v3, v45 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v145 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v59 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v56 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v60 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v63 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v61 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v3, v60 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v160 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v62 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v2, v63 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v72 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v73 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v160 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v75 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v1, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v176 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v179 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v181 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v75 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v76 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v77 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v3, v77 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v40 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v43 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v78 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v79 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v89 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v40 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v91 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v88 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v57 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v3, v89 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v47 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v58 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v93 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v3, v91 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v92 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v57 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v93 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v0.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v1, v92 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v2.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_3 ; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true @@ -208098,233 +207538,211 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_4 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff -; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 -; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5 ; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 +; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8 -; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8 -; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10 -; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8 -; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff -; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12 +; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff +; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8 +; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9 +; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s24, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s25, 8 +; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s11 +; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff +; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-FAKE16-NEXT: s_or_b32 s11, s11, s12 +; GFX11-FAKE16-NEXT: s_and_b32 s10, s26, 0xff +; GFX11-FAKE16-NEXT: v_and_b32_e64 v3, 0xffff, s11 +; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s27, 8 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v66 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v64 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v36 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v65 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v49 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v6, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v37 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v71 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v6, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v1, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v50 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v53 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v85 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v55 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v81 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v86 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v9, v83 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v100 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v96 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v11, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v99 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v12, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v97 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v98 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v101 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v13, v112 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v12, v102 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v116 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v14, 16, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v119 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v15, 16, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v133 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v128 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v117 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v134 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v129 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v15, v132 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v16, v147 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v17, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v165 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v149 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v162 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v42 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v18, v177 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v19, v41 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v20, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v21, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v131 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v145 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v118 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v59 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v135 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v60 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v21, v56 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v22, v61 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v23, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v24, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v163 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v176 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v160 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v73 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v164 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v75 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v24, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v25, v74 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v26, 16, v2 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v27, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v183 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v43 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v182 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v78 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v40 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v89 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v27, v79 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v28, v88 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v58 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v29, 16, v2 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v30, 16, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v46 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v57 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v92 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v90 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v2, 16, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s5 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v3, 16, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB97_3 ; GFX11-FAKE16-NEXT: .LBB97_2: ; %cmp.true @@ -208773,21 +208191,21 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill @@ -208917,21 +208335,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209000,15 +208415,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 @@ -209029,8 +208444,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -209042,53 +208457,51 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v59 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v58 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -209096,67 +208509,72 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v47 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v47 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v60, 0xffff, v61 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -209167,7 +208585,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v45, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -209178,7 +208596,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v43, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -209189,7 +208607,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v41, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -209228,14 +208646,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v50, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v49, v2, v6 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v49, v2, v27 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v48, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload @@ -209251,9 +208670,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v37, v2, v29 +; SI-NEXT: v_or_b32_e32 v37, v2, v15 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v36, v2, v6 @@ -209266,32 +208685,32 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v34, v2, v6 +; SI-NEXT: v_or_b32_e32 v33, v2, v6 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v33, v2, v11 +; SI-NEXT: v_or_b32_e32 v34, v2, v11 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v31, v2, v6 +; SI-NEXT: v_or_b32_e32 v31, v2, v23 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v32, v2, v7 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v26, v2, v15 +; SI-NEXT: v_or_b32_e32 v26, v2, v29 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v30, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -209301,22 +208720,22 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v22, v2, v5 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v14, v2, v17 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v10, v2, v9 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v6, v2, v13 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -209328,13 +208747,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill @@ -209342,20 +208761,17 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v4, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill @@ -209367,7 +208783,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v16, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill @@ -209376,46 +208792,49 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v28, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v60, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v59, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v59, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v47, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v63, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v58, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v57, 8, 8 +; SI-NEXT: v_bfe_u32 v1, v63, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v58, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_bfe_u32 v1, v57, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v56, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v61, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v27, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v62, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v47, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: v_bfe_u32 v1, v61, 8, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v45, v46, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v45, v46, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill @@ -209448,7 +208867,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v55, v40, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill @@ -209457,7 +208876,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v51, v52, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill @@ -209472,7 +208891,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v49, v50, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v49, v50, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -209504,59 +208923,59 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v1, v35, v36, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v30, v26, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v30, v26, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v30, v26, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v22, v18, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v22, v18, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v22, v18, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v10, v14, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v10, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v10, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v2, v6, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v2, v6, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v2, v6, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v45 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill @@ -209588,7 +209007,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v35 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 @@ -209598,7 +209017,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill @@ -209708,74 +209127,73 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: .LBB98_2: ; %Flow ; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b32 s6, 0x30000 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v8, v9, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_or_b32_e32 v2, v13, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v8, v9, v6 +; SI-NEXT: v_or_b32_e32 v9, v21, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v9 ; SI-NEXT: v_alignbit_b32 v12, v2, v6, 24 -; SI-NEXT: v_alignbit_b32 v20, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v47, v2, v6, 8 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v10 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; SI-NEXT: v_alignbit_b32 v16, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v27, v2, v6, 8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v5, v1 ; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v62, v22, v18, 24 -; SI-NEXT: v_alignbit_b32 v63, v22, v18, 16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v61, v22, v18, 24 +; SI-NEXT: v_alignbit_b32 v62, v22, v18, 16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 ; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -209784,36 +209202,33 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v7, v1 ; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v1 +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v34 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209827,6 +209242,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209837,7 +209253,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -209859,27 +209275,30 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v17, v4 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v4 -; SI-NEXT: v_alignbit_b32 v56, v10, v14, 24 -; SI-NEXT: v_alignbit_b32 v57, v10, v14, 16 -; SI-NEXT: v_alignbit_b32 v61, v10, v14, 8 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v28, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v47, v10, v14, 16 +; SI-NEXT: v_alignbit_b32 v59, v10, v14, 8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209895,7 +209314,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209903,7 +209322,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209911,7 +209330,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209919,7 +209338,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209927,7 +209346,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209935,7 +209354,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209943,7 +209362,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209951,7 +209370,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209959,7 +209378,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -209997,7 +209416,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v55, v40, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill @@ -210006,7 +209425,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v53, v54, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v51, v52, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill @@ -210021,7 +209440,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v49, v50, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v49, v50, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -210053,35 +209472,35 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v1, v35, v36, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 24 +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 16 +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v33, v34, 8 +; SI-NEXT: v_alignbit_b32 v1, v34, v33, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v30, v26, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v30, v26, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v30, v26, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v22, v18, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v45 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill @@ -210105,13 +209524,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill @@ -210123,7 +209542,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v53 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill @@ -210138,7 +209557,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill @@ -210159,7 +209578,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill @@ -210167,20 +209586,20 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v35 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v35 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v34 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill @@ -210189,13 +209608,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 ; SI-NEXT: v_alignbit_b32 v4, v45, v46, 24 @@ -210300,7 +209719,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -210311,7 +209730,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload @@ -210332,7 +209751,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -210347,7 +209766,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload @@ -210372,7 +209791,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -210426,7 +209845,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -210440,7 +209859,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -210516,7 +209935,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -210546,26 +209965,24 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 @@ -210584,11 +210001,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v58 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -210600,11 +210017,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -210616,29 +210033,31 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -210656,7 +210075,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -210667,28 +210086,28 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v62 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v61 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v62 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v59 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v57 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 @@ -210697,11 +210116,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v59 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v56 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -210710,11 +210129,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 @@ -210723,9 +210142,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -210741,7 +210160,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -210769,23 +210188,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v64i16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -210802,1185 +210204,1219 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; kill: killed $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_mov_b32_e32 v55, v0 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v40 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v39 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v37 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; kill: killed $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; kill: killed $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; kill: killed $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; kill: killed $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; kill: killed $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; kill: killed $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB98_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[13:14] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v31, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v10 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v11 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v12 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v13 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v14 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v15 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v8 -; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v7, v6 -; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v5, v3 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v37 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v30 +; VI-NEXT: v_lshrrev_b64 v[36:37], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v1 +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v2 +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[3:4] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; VI-NEXT: v_mov_b32_e32 v1, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v4 +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[5:6] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v6 +; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v6 +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[7:8] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v8 +; VI-NEXT: v_mov_b32_e32 v1, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v8 +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v9 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[9:10] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v10 +; VI-NEXT: v_mov_b32_e32 v1, v9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v10 +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[11:12] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v12 +; VI-NEXT: v_mov_b32_e32 v1, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v12 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v12 +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v13 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[13:14] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v14 +; VI-NEXT: v_mov_b32_e32 v1, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v14 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v14 +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v15 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[15:16] +; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v16 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v16 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v17 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[17:18] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v18 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[19:20] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v20 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[21:22] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v22 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v20 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v19 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v20 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v17 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v18 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v46 -; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24 -; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v20 -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17 -; VI-NEXT: v_mov_b32_e32 v46, v1 -; VI-NEXT: ; implicit-def: $vgpr1 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr7 -; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr13 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: .LBB98_2: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB98_4 -; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v31, 3 -; VI-NEXT: v_add_u16_sdwa v55, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v32, 3, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 -; VI-NEXT: v_add_u16_sdwa v54, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v18, v32, v18 -; VI-NEXT: v_add_u16_e32 v32, 3, v17 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v54 -; VI-NEXT: v_add_u16_sdwa v38, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v17, v32, v17 -; VI-NEXT: v_add_u16_e32 v32, 3, v20 -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v38 -; VI-NEXT: v_add_u16_sdwa v62, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v20, v32, v20 -; VI-NEXT: v_add_u16_e32 v32, 3, v19 -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v62 -; VI-NEXT: v_add_u16_sdwa v48, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v19, v32, v19 -; VI-NEXT: v_add_u16_e32 v32, 3, v22 -; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 -; VI-NEXT: v_add_u16_sdwa v53, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v22, v32, v22 -; VI-NEXT: v_add_u16_e32 v32, 3, v21 -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; VI-NEXT: v_add_u16_sdwa v61, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v21, v32, v21 -; VI-NEXT: v_add_u16_e32 v32, 3, v24 -; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 -; VI-NEXT: v_add_u16_sdwa v49, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v24, v32, v24 -; VI-NEXT: v_add_u16_e32 v32, 3, v23 -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 -; VI-NEXT: v_add_u16_sdwa v58, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v23, v32, v23 -; VI-NEXT: v_add_u16_e32 v32, 3, v26 -; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v26, v32, v26 -; VI-NEXT: v_add_u16_e32 v32, 3, v25 -; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_add_u16_sdwa v39, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v25, v32, v25 -; VI-NEXT: v_add_u16_e32 v32, 3, v28 -; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v28, v32, v28 -; VI-NEXT: v_add_u16_e32 v32, 3, v27 -; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_add_u16_sdwa v35, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v27, v32, v27 -; VI-NEXT: v_add_u16_e32 v33, 3, v30 -; VI-NEXT: v_add_u16_e32 v34, 3, v29 -; VI-NEXT: v_add_u16_sdwa v32, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 -; VI-NEXT: v_add_u16_sdwa v52, v37, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v30, v33, v29 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 -; VI-NEXT: v_add_u16_e32 v33, 3, v37 -; VI-NEXT: v_add_u16_sdwa v50, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v29, v34, v29 -; VI-NEXT: v_add_u16_e32 v34, 3, v36 -; VI-NEXT: v_or_b32_e32 v37, v33, v32 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v50 -; VI-NEXT: v_add_u16_sdwa v57, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v36, v34, v32 -; VI-NEXT: v_add_u16_e32 v33, 3, v2 -; VI-NEXT: v_add_u16_e32 v34, 3, v1 -; VI-NEXT: v_add_u16_sdwa v32, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; VI-NEXT: v_or_b32_e32 v2, v33, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; VI-NEXT: v_add_u16_sdwa v56, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v1, v34, v1 -; VI-NEXT: v_add_u16_e32 v33, 3, v4 -; VI-NEXT: v_add_u16_e32 v34, 3, v3 -; VI-NEXT: v_add_u16_sdwa v32, v3, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; VI-NEXT: v_or_b32_e32 v4, v33, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; VI-NEXT: v_add_u16_sdwa v47, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v3, v34, v3 -; VI-NEXT: v_add_u16_e32 v33, 3, v6 -; VI-NEXT: v_add_u16_e32 v34, 3, v5 -; VI-NEXT: v_add_u16_sdwa v32, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; VI-NEXT: v_or_b32_e32 v6, v33, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v5, v34, v5 -; VI-NEXT: v_add_u16_sdwa v34, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_add_u16_e32 v40, 3, v8 -; VI-NEXT: v_add_u16_e32 v33, 3, v7 -; VI-NEXT: v_add_u16_sdwa v32, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; VI-NEXT: v_or_b32_e32 v8, v40, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 -; VI-NEXT: v_add_u16_sdwa v59, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v7, v33, v7 -; VI-NEXT: v_add_u16_e32 v33, 3, v10 -; VI-NEXT: v_add_u16_e32 v40, 3, v9 -; VI-NEXT: v_add_u16_sdwa v32, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 -; VI-NEXT: v_or_b32_e32 v10, v33, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; VI-NEXT: v_add_u16_sdwa v63, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v9, v40, v9 -; VI-NEXT: v_add_u16_e32 v33, 3, v12 -; VI-NEXT: v_add_u16_e32 v40, 3, v11 -; VI-NEXT: v_add_u16_sdwa v32, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v12, v33, v11 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; VI-NEXT: v_add_u16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v11, v40, v11 -; VI-NEXT: v_add_u16_e32 v40, 3, v14 -; VI-NEXT: v_add_u16_e32 v41, 3, v13 -; VI-NEXT: v_add_u16_sdwa v32, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 -; VI-NEXT: v_add_u16_sdwa v60, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v40, v13 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; VI-NEXT: v_add_u16_sdwa v31, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v16, 3, v16 -; VI-NEXT: v_add_u16_e32 v32, 3, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v16, v16, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v15, v32, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v16 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v15 -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16] -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v13, v41, v13 -; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v14 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 -; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 -; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 -; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[23:24] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v20 +; VI-NEXT: v_mov_b32_e32 v1, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 +; VI-NEXT: v_mov_b32_e32 v1, v28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v39 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v60, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v63, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v59, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v34, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v47, 8, 8 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v56, 8, 8 -; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v57, 8, 8 -; VI-NEXT: v_mov_b32_e32 v46, v35 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v52, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v46, 8, 8 -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v39, 8, 8 -; VI-NEXT: v_mov_b32_e32 v51, v49 -; VI-NEXT: v_mov_b32_e32 v49, v53 -; VI-NEXT: v_mov_b32_e32 v53, v38 -; VI-NEXT: v_mov_b32_e32 v38, v55 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v35, v58, 8, 8 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v39, v61, 8, 8 -; VI-NEXT: v_bfe_u32 v58, v48, 8, 8 -; VI-NEXT: v_mov_b32_e32 v55, v31 -; VI-NEXT: v_bfe_u32 v61, v53, 8, 8 -; VI-NEXT: v_bfe_u32 v31, v38, 8, 8 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[39:40] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v39 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v40 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, v40 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v37, v15 +; VI-NEXT: v_mov_b32_e32 v31, v16 +; VI-NEXT: v_mov_b32_e32 v0, v17 +; VI-NEXT: v_mov_b32_e32 v36, v18 +; VI-NEXT: v_mov_b32_e32 v47, v19 +; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v20 +; VI-NEXT: v_mov_b32_e32 v46, v20 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v24 +; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v26 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v28 +; VI-NEXT: v_lshrrev_b32_e32 v38, 24, v30 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v40 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: .LBB98_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB98_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v37, 3 +; VI-NEXT: v_add_u16_sdwa v59, v22, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v31, 3, v22 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v22, v31, v22 +; VI-NEXT: v_add_u16_e32 v31, 3, v21 +; VI-NEXT: v_add_u16_sdwa v21, v21, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_add_u16_sdwa v61, v24, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v21, v31, v21 +; VI-NEXT: v_add_u16_e32 v31, 3, v24 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v24, v31, v24 +; VI-NEXT: v_add_u16_e32 v31, 3, v23 +; VI-NEXT: v_add_u16_sdwa v23, v23, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_add_u16_sdwa v58, v26, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v23, v31, v23 +; VI-NEXT: v_add_u16_e32 v31, 3, v26 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v26, v31, v26 +; VI-NEXT: v_add_u16_e32 v31, 3, v25 +; VI-NEXT: v_add_u16_sdwa v25, v25, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_add_u16_sdwa v57, v28, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v25, v31, v25 +; VI-NEXT: v_add_u16_e32 v31, 3, v28 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v57 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v28, v31, v28 +; VI-NEXT: v_add_u16_e32 v31, 3, v27 +; VI-NEXT: v_add_u16_sdwa v27, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_add_u16_sdwa v38, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v27, v31, v27 +; VI-NEXT: v_add_u16_e32 v31, 3, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v38 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v30, v31, v30 +; VI-NEXT: v_add_u16_e32 v31, 3, v29 +; VI-NEXT: v_add_u16_sdwa v29, v29, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_add_u16_sdwa v56, v40, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v29, v31, v29 +; VI-NEXT: v_add_u16_e32 v33, 3, v40 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v56 +; VI-NEXT: v_or_b32_e32 v40, v33, v31 +; VI-NEXT: v_add_u16_sdwa v31, v39, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v33, 3, v39 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_add_u16_sdwa v41, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v39, v33, v31 +; VI-NEXT: v_add_u16_e32 v31, 3, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; VI-NEXT: v_add_u16_sdwa v35, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v2, v31, v2 +; VI-NEXT: v_add_u16_e32 v31, 3, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v35 +; VI-NEXT: v_add_u16_sdwa v51, v4, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v1, v31, v1 +; VI-NEXT: v_add_u16_e32 v31, 3, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; VI-NEXT: v_add_u16_sdwa v34, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v4, v31, v4 +; VI-NEXT: v_add_u16_e32 v31, 3, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; VI-NEXT: v_add_u16_sdwa v53, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v3, v31, v3 +; VI-NEXT: v_add_u16_e32 v31, 3, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; VI-NEXT: v_add_u16_sdwa v45, v5, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v6, v31, v6 +; VI-NEXT: v_add_u16_e32 v31, 3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; VI-NEXT: v_add_u16_sdwa v54, v8, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v5, v31, v5 +; VI-NEXT: v_add_u16_e32 v31, 3, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v54 +; VI-NEXT: v_add_u16_sdwa v33, v7, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v8, v31, v8 +; VI-NEXT: v_add_u16_e32 v31, 3, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; VI-NEXT: v_add_u16_sdwa v52, v10, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v7, v31, v7 +; VI-NEXT: v_add_u16_e32 v31, 3, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v52 +; VI-NEXT: v_add_u16_sdwa v60, v9, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v10, v31, v10 +; VI-NEXT: v_add_u16_e32 v31, 3, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 +; VI-NEXT: v_add_u16_sdwa v50, v12, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v32, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v9, v31, v9 +; VI-NEXT: v_add_u16_e32 v31, 3, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v50 +; VI-NEXT: v_add_u16_sdwa v44, v11, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v12, v31, v12 +; VI-NEXT: v_add_u16_e32 v31, 3, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v44 +; VI-NEXT: v_add_u16_sdwa v49, v14, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v18, v0, v18 +; VI-NEXT: v_add_u16_e32 v0, 3, v17 +; VI-NEXT: v_add_u16_sdwa v17, v17, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v11, v31, v11 +; VI-NEXT: v_add_u16_e32 v31, 3, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; VI-NEXT: v_add_u16_sdwa v43, v13, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v14, v31, v14 +; VI-NEXT: v_add_u16_e32 v31, 3, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; VI-NEXT: v_add_u16_sdwa v48, v16, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v63, v15, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v17, v0, v17 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v13, v31, v13 +; VI-NEXT: v_add_u16_e32 v0, 3, v16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; VI-NEXT: v_add_u16_e32 v31, 3, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v63 +; VI-NEXT: v_or_b32_e32 v16, v0, v16 +; VI-NEXT: v_or_b32_e32 v15, v31, v15 +; VI-NEXT: v_add_u16_sdwa v62, v20, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v42, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16] +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v37, v31 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v16 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v15 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v14 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[3:4] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[39:40] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[29:30] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[27:28] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[25:26] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v26 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v25 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[23:24] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v23 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[21:22] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v46, 3, v20 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v62 +; VI-NEXT: v_add_u16_e32 v47, 3, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v42 +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v22 +; VI-NEXT: v_or_b32_e32 v20, v46, v20 +; VI-NEXT: v_or_b32_e32 v19, v47, v19 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v21 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[19:20] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v19 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[17:18] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v17 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v0, v48, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v0, v49, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v0, v50, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v0, v52, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v0, v54, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v0, v53, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v0, v51, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v0, v41, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v0, v59, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v0, v32, 8, 8 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v42 +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v56, v56, 8, 8 +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v38, v38, 8, 8 +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v57, v57, 8, 8 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v58, v58, 8, 8 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v61, v61, 8, 8 +; VI-NEXT: v_bfe_u32 v62, v62, 8, 8 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: .LBB98_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 -; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v32 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 -; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v62 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41 -; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v55 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v55 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -212007,65 +211443,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -212083,71 +211460,43 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr63 -; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr38 -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr49 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -212159,6 +211508,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -212169,6 +211522,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -212179,6 +211536,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -212189,6 +211550,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -212199,196 +211564,295 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; kill: killed $vgpr34 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB98_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: s_waitcnt vmcnt(62) +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v3 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(62) +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 -; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[34:35], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v17 ; GFX9-NEXT: .LBB98_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB98_4 @@ -212398,507 +211862,521 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(60) +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] ; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] ; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] ; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] ; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v3 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[46:47], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v17 ; GFX9-NEXT: .LBB98_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v63 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49 -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42 -; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v34 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v34, v34, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v61, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 @@ -212906,11 +212384,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -212919,10 +212397,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -212932,11 +212410,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -212945,12 +212423,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -213229,17 +212710,17 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v167.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v166.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v166.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v165.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v160.l @@ -213454,101 +212935,101 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr73 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr63 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr61 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr58 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr56 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr46 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr44 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr43 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr40 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr182 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr180 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr178 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr176 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr166 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr165 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr162 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr160 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr151 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr148 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr146 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr145 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr134 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr132 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr130 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr128 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr118 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr117 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr116 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr103 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr101 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr98 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr97 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr87 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr62 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr60 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr59 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr57 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr47 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr45 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr42 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr41 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr183 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr181 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr179 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr177 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr167 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr164 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr163 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr161 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr150 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr149 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr147 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr144 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr135 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr133 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr131 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr129 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr119 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr115 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr114 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr113 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr112 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr102 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr100 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33 @@ -213557,375 +213038,375 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-FAKE16-NEXT: .LBB98_2: ; %Flow ; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB98_4 ; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] ; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 24, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 8, v16 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 8, v15 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 24, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 8, v14 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 8, v13 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v11 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 8, v10 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v9 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 24, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 16, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v8 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 16, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 8, v7 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v6 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v5 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v4 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v3 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 16, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 8, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 24, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 8, v31 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 24, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v30 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v29 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 24, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 8, v28 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 8, v27 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 24, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 8, v26 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 8, v25 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 24, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 16, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 8, v24 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 16, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 8, v23 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 24, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 16, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 8, v22 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 16, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 8, v21 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 24, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 8, v19 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 24, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v17 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-FAKE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 24, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 8, v16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 8, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 24, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 8, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 24, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 8, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 8, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 8, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v148, 8, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v151, 24, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v160, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v162, 8, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v165, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v166, 8, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v176, 24, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v178, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v180, 8, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v182, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v40, 8, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v43, 24, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v46, 8, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v56, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v58, 8, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v72, 8, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v73, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 24, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 8, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v86, 24, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 8, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 24, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 8, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 24, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 8, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 8, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v149, 24, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v150, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v161, 8, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v163, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v164, 8, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v167, 24, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v177, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v179, 8, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v181, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v183, 8, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v41, 24, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v45, 8, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v57, 8, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v59, 24, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v74, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-FAKE16-NEXT: .LBB98_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v74 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v64 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v75 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v73 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v68 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v60 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v72 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v61 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v47 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v68 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v72 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v63 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v63 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v54 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v42 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v62 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v53 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v68, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v58 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xff, v56 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v55, v55, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v57 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v45 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v43 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v51, 8, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v68 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v69, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v46 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v43 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v64, v67 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v183 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v181 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v68, v69 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v182 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v64, v53 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v179 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v177 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v167 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v68, v66 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v180 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v178 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v68, 8, v176 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, v55, v64 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v54, 8, v165 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xff, v164 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v68 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v66, 8, v166 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v67, 0xff, v165 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v51 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v54 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v55, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v161 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v151 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v150 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v39 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v67, v65 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v162 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v160 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v151 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v66, v67 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v146 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v54, v55 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v145 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v147 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v148 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v144 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v145 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v54, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v52 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xff, v133 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v131 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v66, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v54 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v64 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v134 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v64, 8, v132 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v53, 8, v129 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xff, v119 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v65, 8, v130 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xff, v128 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v52, 8, v52 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v55, 8, v117 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v64, 0xff, v116 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v115 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v51, v51, v52 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v53 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, v54, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v55 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v64, v67 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v67, 8, v118 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xff, v117 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v69, 8, v116 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, v54, v64 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v65 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v52, v66, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v67 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, v68, v69 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64 ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v39 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v51 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v50 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v51 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v10, v54 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v11, v52 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v12, v64 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v13 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v102 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v100 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v103 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v101 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v8, 8, v49 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v98 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v97 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v10, 8, v99 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v98 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v12, 8, v97 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v86 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v14, 8, v87 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v7, v8 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v9, v10 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v83 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v48 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v16 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v80 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v70 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v68 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v81 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v71 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v55 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v75 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v73 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v66 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v39 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v39, 0xff, v74 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v48, 8, v53 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 @@ -213947,30 +213428,30 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v12 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v14 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v61 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v59 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v58 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v11, 8, v62 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v60 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v13, 8, v59 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v56 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v46 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v65 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v15, 8, v57 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v47 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v17, 8, v50 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v20 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v44 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v19, 8, v45 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v41 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v40 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v41 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v21 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v182 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v180 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v183 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v181 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v38 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v178 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v176 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v166 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v179 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v38, 0xff, v177 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v39, 8, v167 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 @@ -213992,29 +213473,29 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v16, v17 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v19 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v23 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v163 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v162 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v16, 8, v164 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v163 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v18, 8, v37 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xff, v24 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v160 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v148 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v20, 8, v161 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v150 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v22, 8, v149 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v25 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v146 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v24, 8, v147 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v17, v18 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v19, v20 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v144 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v36 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v134 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v132 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v130 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v135 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v133 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v131 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v128 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v129 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xff, v119 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v35, 8, v35 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 @@ -214037,31 +213518,31 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v21, v22 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v23, v24 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v114 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v113 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v112 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v21, 8, v115 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v114 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v23, 8, v113 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v103 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v25, 8, v112 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v102 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v27, 8, v34 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v30 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v99 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v29, 8, v100 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v87 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v96 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v26, 8, v86 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v31 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v84 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v28, 8, v85 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xff, v84 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v30, 8, v33 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v32 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v81 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v71 -; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v69 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v32, 8, v82 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xff, v80 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v34, 8, v70 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v27, v28 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v29, v30 @@ -214171,75 +213652,63 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v40, s68, 20 ; SI-NEXT: v_writelane_b32 v40, s69, 21 ; SI-NEXT: v_writelane_b32 v40, s70, 22 -; SI-NEXT: s_mov_b32 s88, s17 ; SI-NEXT: v_writelane_b32 v40, s71, 23 ; SI-NEXT: v_writelane_b32 v40, s80, 24 ; SI-NEXT: v_writelane_b32 v40, s81, 25 ; SI-NEXT: v_writelane_b32 v40, s82, 26 ; SI-NEXT: v_writelane_b32 v40, s83, 27 -; SI-NEXT: v_readfirstlane_b32 s6, v16 -; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s7, v15 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_writelane_b32 v41, s6, 0 -; SI-NEXT: v_readfirstlane_b32 s8, v21 -; SI-NEXT: v_writelane_b32 v41, s7, 1 -; SI-NEXT: v_readfirstlane_b32 s9, v20 -; SI-NEXT: v_writelane_b32 v41, s8, 2 -; SI-NEXT: v_readfirstlane_b32 s10, v19 -; SI-NEXT: v_writelane_b32 v41, s9, 3 -; SI-NEXT: v_readfirstlane_b32 s11, v25 -; SI-NEXT: v_writelane_b32 v41, s10, 4 -; SI-NEXT: v_readfirstlane_b32 s12, v24 -; SI-NEXT: v_writelane_b32 v41, s11, 5 -; SI-NEXT: v_readfirstlane_b32 s13, v23 -; SI-NEXT: v_writelane_b32 v41, s12, 6 -; SI-NEXT: v_readfirstlane_b32 s15, v29 -; SI-NEXT: v_writelane_b32 v41, s13, 7 -; SI-NEXT: v_readfirstlane_b32 s14, v28 -; SI-NEXT: v_writelane_b32 v41, s15, 8 -; SI-NEXT: s_mov_b32 s79, s16 -; SI-NEXT: v_readfirstlane_b32 s16, v27 -; SI-NEXT: v_writelane_b32 v41, s14, 9 -; SI-NEXT: v_writelane_b32 v41, s16, 10 ; SI-NEXT: v_writelane_b32 v40, s84, 28 ; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane +; SI-NEXT: s_mov_b32 s6, s22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_writelane_b32 v41, s18, 0 +; SI-NEXT: v_writelane_b32 v41, s17, 1 +; SI-NEXT: v_writelane_b32 v41, s16, 2 +; SI-NEXT: v_writelane_b32 v41, s6, 3 +; SI-NEXT: v_writelane_b32 v41, s21, 4 +; SI-NEXT: v_writelane_b32 v41, s20, 5 +; SI-NEXT: v_writelane_b32 v41, s26, 6 +; SI-NEXT: v_writelane_b32 v41, s25, 7 +; SI-NEXT: v_writelane_b32 v41, s24, 8 +; SI-NEXT: v_writelane_b32 v41, s29, 9 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_writelane_b32 v41, s28, 10 +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_writelane_b32 v41, s8, 11 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: v_writelane_b32 v41, s9, 12 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_writelane_b32 v41, s10, 13 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_writelane_b32 v41, s11, 14 +; SI-NEXT: v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_writelane_b32 v41, s12, 15 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_writelane_b32 v41, s13, 16 +; SI-NEXT: v_readfirstlane_b32 s15, v13 +; SI-NEXT: v_writelane_b32 v41, s14, 17 ; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_readfirstlane_b32 s40, v12 +; SI-NEXT: v_writelane_b32 v41, s15, 18 ; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: v_readfirstlane_b32 s41, v11 +; SI-NEXT: v_writelane_b32 v41, s40, 19 ; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s42, v17 +; SI-NEXT: v_writelane_b32 v41, s41, 20 ; SI-NEXT: v_writelane_b32 v40, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s43, v16 +; SI-NEXT: v_writelane_b32 v41, s42, 21 ; SI-NEXT: v_writelane_b32 v40, s98, 34 -; SI-NEXT: v_writelane_b32 v40, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s98, v30 -; SI-NEXT: v_readfirstlane_b32 s97, v26 -; SI-NEXT: v_readfirstlane_b32 s96, v22 -; SI-NEXT: v_readfirstlane_b32 s87, v18 -; SI-NEXT: v_readfirstlane_b32 s81, v17 -; SI-NEXT: v_readfirstlane_b32 s86, v14 -; SI-NEXT: v_readfirstlane_b32 s67, v13 -; SI-NEXT: v_readfirstlane_b32 s69, v12 -; SI-NEXT: v_readfirstlane_b32 s71, v11 -; SI-NEXT: v_readfirstlane_b32 s85, v10 -; SI-NEXT: v_readfirstlane_b32 s51, v9 -; SI-NEXT: v_readfirstlane_b32 s53, v8 +; SI-NEXT: v_readfirstlane_b32 s44, v15 +; SI-NEXT: v_writelane_b32 v41, s43, 22 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s89, v31 +; SI-NEXT: v_readfirstlane_b32 s77, v31 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s91, v32 +; SI-NEXT: v_readfirstlane_b32 s76, v32 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s93, v33 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s55, v34 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s17, v35 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s95, v36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s35, v37 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: v_readfirstlane_b32 s78, v33 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 @@ -214249,1056 +213718,1075 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s83, v38 +; SI-NEXT: v_readfirstlane_b32 s90, v34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s85, v35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s7, v36 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s92, v38 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s65, v7 -; SI-NEXT: v_readfirstlane_b32 s84, v6 -; SI-NEXT: v_readfirstlane_b32 s31, v5 -; SI-NEXT: v_readfirstlane_b32 s37, v4 -; SI-NEXT: v_readfirstlane_b32 s49, v3 -; SI-NEXT: v_readfirstlane_b32 s78, v2 -; SI-NEXT: v_readfirstlane_b32 s39, v1 -; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s79, v37 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s39, v30 +; SI-NEXT: v_readfirstlane_b32 s48, v29 +; SI-NEXT: v_readfirstlane_b32 s89, v28 +; SI-NEXT: v_readfirstlane_b32 s53, v27 +; SI-NEXT: v_readfirstlane_b32 s34, v26 +; SI-NEXT: v_readfirstlane_b32 s37, v25 +; SI-NEXT: v_readfirstlane_b32 s38, v24 +; SI-NEXT: v_readfirstlane_b32 s22, v23 +; SI-NEXT: v_readfirstlane_b32 s93, v22 +; SI-NEXT: v_readfirstlane_b32 s94, v21 +; SI-NEXT: v_readfirstlane_b32 s95, v20 +; SI-NEXT: v_readfirstlane_b32 s36, v19 +; SI-NEXT: v_readfirstlane_b32 s88, v18 +; SI-NEXT: v_readfirstlane_b32 s91, v14 +; SI-NEXT: v_readfirstlane_b32 s99, v10 +; SI-NEXT: v_readfirstlane_b32 s86, v6 +; SI-NEXT: v_readfirstlane_b32 s84, v2 +; SI-NEXT: v_writelane_b32 v41, s44, 23 +; SI-NEXT: v_writelane_b32 v41, s94, 24 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s67, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s80, v32 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s96, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s69, v39 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s77, v31 +; SI-NEXT: v_readfirstlane_b32 s66, v48 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s38, v32 +; SI-NEXT: v_readfirstlane_b32 s70, v49 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s48, v33 +; SI-NEXT: v_readfirstlane_b32 s65, v50 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s50, v39 -; SI-NEXT: v_readfirstlane_b32 s90, v35 -; SI-NEXT: v_readfirstlane_b32 s92, v36 -; SI-NEXT: v_writelane_b32 v41, s90, 11 -; SI-NEXT: v_readfirstlane_b32 s94, v37 -; SI-NEXT: v_writelane_b32 v41, s92, 12 +; SI-NEXT: v_readfirstlane_b32 s87, v51 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s30, v49 -; SI-NEXT: v_writelane_b32 v41, s94, 13 +; SI-NEXT: v_readfirstlane_b32 s52, v34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s34, v50 -; SI-NEXT: v_writelane_b32 v41, s30, 14 +; SI-NEXT: v_readfirstlane_b32 s54, v35 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s36, v51 -; SI-NEXT: v_writelane_b32 v41, s34, 15 -; SI-NEXT: v_writelane_b32 v41, s36, 16 +; SI-NEXT: v_readfirstlane_b32 s64, v36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 -; SI-NEXT: v_writelane_b32 v41, s38, 17 -; SI-NEXT: v_readfirstlane_b32 s76, v48 -; SI-NEXT: v_readfirstlane_b32 s99, v34 +; SI-NEXT: v_readfirstlane_b32 s68, v37 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_writelane_b32 v41, s48, 18 -; SI-NEXT: v_writelane_b32 v41, s50, 19 ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s79, 0xffff -; SI-NEXT: s_lshl_b32 s5, s88, 16 -; SI-NEXT: s_or_b32 s60, s4, s5 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s74, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xffff ; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: s_or_b32 s61, s4, s5 +; SI-NEXT: s_or_b32 s75, s4, s5 ; SI-NEXT: s_and_b32 s4, s20, 0xffff ; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: s_or_b32 s56, s4, s5 -; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_or_b32 s72, s4, s5 +; SI-NEXT: s_and_b32 s4, s6, 0xffff ; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: s_or_b32 s57, s4, s5 +; SI-NEXT: s_or_b32 s73, s4, s5 ; SI-NEXT: s_and_b32 s4, s24, 0xffff ; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_or_b32 s62, s4, s5 ; SI-NEXT: s_and_b32 s4, s26, 0xffff ; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_or_b32 s63, s4, s5 ; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: s_or_b32 s74, s4, s5 -; SI-NEXT: s_and_b32 s4, s39, 0xffff -; SI-NEXT: s_lshl_b32 s5, s78, 16 -; SI-NEXT: s_or_b32 s75, s4, s5 -; SI-NEXT: s_and_b32 s4, s49, 0xffff -; SI-NEXT: s_lshl_b32 s5, s37, 16 -; SI-NEXT: s_or_b32 s72, s4, s5 -; SI-NEXT: s_and_b32 s4, s31, 0xffff +; SI-NEXT: s_or_b32 s60, s4, s5 +; SI-NEXT: s_and_b32 s4, s10, 0xffff ; SI-NEXT: s_lshl_b32 s5, s84, 16 -; SI-NEXT: s_or_b32 s73, s4, s5 -; SI-NEXT: s_and_b32 s4, s65, 0xffff -; SI-NEXT: s_lshl_b32 s5, s53, 16 -; SI-NEXT: s_or_b32 s62, s4, s5 -; SI-NEXT: s_and_b32 s4, s51, 0xffff -; SI-NEXT: s_lshl_b32 s5, s85, 16 -; SI-NEXT: s_or_b32 s63, s4, s5 -; SI-NEXT: s_and_b32 s4, s71, 0xffff -; SI-NEXT: s_lshl_b32 s5, s69, 16 +; SI-NEXT: s_or_b32 s61, s4, s5 +; SI-NEXT: s_and_b32 s4, s11, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 ; SI-NEXT: s_or_b32 s58, s4, s5 -; SI-NEXT: s_and_b32 s4, s67, 0xffff +; SI-NEXT: s_and_b32 s4, s8, 0xffff ; SI-NEXT: s_lshl_b32 s5, s86, 16 ; SI-NEXT: s_or_b32 s59, s4, s5 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_and_b32 s4, s14, 0xffff +; SI-NEXT: s_lshl_b32 s5, s13, 16 +; SI-NEXT: s_or_b32 s56, s4, s5 +; SI-NEXT: s_and_b32 s4, s12, 0xffff +; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: s_or_b32 s57, s4, s5 +; SI-NEXT: s_and_b32 s4, s41, 0xffff +; SI-NEXT: s_lshl_b32 s5, s40, 16 ; SI-NEXT: s_or_b32 s46, s4, s5 -; SI-NEXT: s_and_b32 s4, s81, 0xffff -; SI-NEXT: s_lshl_b32 s5, s87, 16 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: s_lshl_b32 s5, s91, 16 ; SI-NEXT: s_or_b32 s47, s4, s5 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_and_b32 s4, s44, 0xffff +; SI-NEXT: s_lshl_b32 s5, s43, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s42, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s36, 0xffff +; SI-NEXT: s_lshl_b32 s5, s95, 16 ; SI-NEXT: s_or_b32 s42, s4, s5 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_and_b32 s4, s94, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 ; SI-NEXT: s_or_b32 s43, s4, s5 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s38, 16 ; SI-NEXT: s_or_b32 s40, s4, s5 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: s_lshl_b32 s5, s97, 16 +; SI-NEXT: s_and_b32 s4, s37, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 ; SI-NEXT: s_or_b32 s41, s4, s5 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_and_b32 s4, s53, 0xffff +; SI-NEXT: s_lshl_b32 s5, s89, 16 ; SI-NEXT: s_or_b32 s14, s4, s5 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: s_and_b32 s4, s48, 0xffff +; SI-NEXT: s_lshl_b32 s5, s39, 16 ; SI-NEXT: s_or_b32 s15, s4, s5 -; SI-NEXT: s_and_b32 s4, s94, 0xffff -; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_and_b32 s4, s68, 0xffff +; SI-NEXT: s_lshl_b32 s5, s64, 16 ; SI-NEXT: s_or_b32 s12, s4, s5 -; SI-NEXT: s_and_b32 s4, s90, 0xffff -; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: s_and_b32 s4, s54, 0xffff +; SI-NEXT: s_lshl_b32 s5, s52, 16 ; SI-NEXT: s_or_b32 s13, s4, s5 -; SI-NEXT: s_and_b32 s4, s36, 0xffff -; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_lshl_b32 s5, s65, 16 ; SI-NEXT: s_or_b32 s10, s4, s5 -; SI-NEXT: s_and_b32 s4, s30, 0xffff -; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_and_b32 s4, s70, 0xffff +; SI-NEXT: s_lshl_b32 s5, s66, 16 ; SI-NEXT: s_or_b32 s11, s4, s5 -; SI-NEXT: s_and_b32 s4, s50, 0xffff -; SI-NEXT: s_lshl_b32 s5, s48, 16 +; SI-NEXT: s_and_b32 s4, s69, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 ; SI-NEXT: s_or_b32 s8, s4, s5 -; SI-NEXT: s_and_b32 s4, s38, 0xffff -; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: s_and_b32 s4, s80, 0xffff +; SI-NEXT: s_lshl_b32 s5, s67, 16 ; SI-NEXT: s_or_b32 s9, s4, s5 -; SI-NEXT: s_and_b32 s4, s83, 0xffff -; SI-NEXT: s_lshl_b32 s5, s35, 16 +; SI-NEXT: s_and_b32 s4, s92, 0xffff +; SI-NEXT: s_lshl_b32 s5, s79, 16 ; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_and_b32 s4, s95, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: s_lshl_b32 s5, s85, 16 +; SI-NEXT: s_mov_b32 s31, s7 ; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s55, 0xffff -; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_and_b32 s4, s90, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s91, 0xffff -; SI-NEXT: s_lshl_b32 s16, s89, 16 +; SI-NEXT: s_and_b32 s5, s76, 0xffff +; SI-NEXT: s_lshl_b32 s16, s77, 16 ; SI-NEXT: s_or_b32 s5, s5, s16 -; SI-NEXT: s_lshr_b32 s16, s61, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v43, s16, 20 -; SI-NEXT: s_lshr_b32 s16, s57, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 23 -; SI-NEXT: s_lshr_b32 s16, s45, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 26 -; SI-NEXT: s_lshr_b32 s16, s75, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 29 -; SI-NEXT: s_lshr_b32 s16, s73, 8 -; SI-NEXT: s_lshr_b64 vcc, s[60:61], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 32 -; SI-NEXT: s_lshr_b32 s16, s63, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 22 -; SI-NEXT: v_writelane_b32 v43, s16, 35 -; SI-NEXT: s_lshr_b32 s16, s59, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 23 -; SI-NEXT: s_lshr_b64 vcc, s[60:61], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 38 +; SI-NEXT: s_lshr_b64 s[16:17], s[74:75], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 29 +; SI-NEXT: v_writelane_b32 v41, s17, 30 +; SI-NEXT: s_lshr_b64 s[16:17], s[74:75], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 27 +; SI-NEXT: v_writelane_b32 v41, s17, 28 +; SI-NEXT: s_lshr_b64 s[16:17], s[74:75], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 25 +; SI-NEXT: v_writelane_b32 v41, s17, 26 +; SI-NEXT: s_lshr_b64 s[16:17], s[72:73], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 35 +; SI-NEXT: v_writelane_b32 v41, s17, 36 +; SI-NEXT: s_lshr_b64 s[16:17], s[72:73], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 33 +; SI-NEXT: v_writelane_b32 v41, s17, 34 +; SI-NEXT: s_lshr_b64 s[16:17], s[72:73], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 31 +; SI-NEXT: v_writelane_b32 v41, s17, 32 +; SI-NEXT: s_lshr_b64 s[16:17], s[62:63], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 41 +; SI-NEXT: v_writelane_b32 v41, s17, 42 +; SI-NEXT: s_lshr_b64 s[16:17], s[62:63], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 39 +; SI-NEXT: v_writelane_b32 v41, s17, 40 +; SI-NEXT: s_lshr_b64 s[16:17], s[62:63], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 37 +; SI-NEXT: v_writelane_b32 v41, s17, 38 +; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 47 +; SI-NEXT: v_writelane_b32 v41, s17, 48 +; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 45 +; SI-NEXT: v_writelane_b32 v41, s17, 46 +; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 43 +; SI-NEXT: v_writelane_b32 v41, s17, 44 +; SI-NEXT: s_lshr_b64 s[16:17], s[58:59], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 53 +; SI-NEXT: v_writelane_b32 v41, s17, 54 +; SI-NEXT: s_lshr_b64 s[16:17], s[58:59], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 51 +; SI-NEXT: v_writelane_b32 v41, s17, 52 +; SI-NEXT: s_lshr_b64 s[16:17], s[58:59], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 49 +; SI-NEXT: v_writelane_b32 v41, s17, 50 +; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 59 +; SI-NEXT: v_writelane_b32 v41, s17, 60 +; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 57 +; SI-NEXT: v_writelane_b32 v41, s17, 58 +; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 55 +; SI-NEXT: v_writelane_b32 v41, s17, 56 +; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v42, s16, 1 +; SI-NEXT: v_writelane_b32 v42, s17, 2 +; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 63 +; SI-NEXT: v_writelane_b32 v42, s17, 0 +; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 61 +; SI-NEXT: v_writelane_b32 v41, s17, 62 +; SI-NEXT: s_lshr_b64 s[16:17], s[44:45], 24 +; SI-NEXT: v_writelane_b32 v42, s16, 7 +; SI-NEXT: v_writelane_b32 v42, s17, 8 +; SI-NEXT: s_lshr_b64 s[16:17], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v42, s16, 5 +; SI-NEXT: v_writelane_b32 v42, s17, 6 +; SI-NEXT: s_lshr_b64 s[16:17], s[44:45], 8 +; SI-NEXT: v_writelane_b32 v42, s16, 3 +; SI-NEXT: v_writelane_b32 v42, s17, 4 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 24 +; SI-NEXT: v_writelane_b32 v42, s16, 13 +; SI-NEXT: v_writelane_b32 v42, s17, 14 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v42, s16, 11 +; SI-NEXT: v_writelane_b32 v42, s17, 12 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 8 +; SI-NEXT: v_writelane_b32 v42, s16, 9 +; SI-NEXT: v_writelane_b32 v42, s17, 10 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v42, s16, 19 +; SI-NEXT: v_writelane_b32 v42, s17, 20 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v42, s16, 17 +; SI-NEXT: v_writelane_b32 v42, s17, 18 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v42, s16, 15 +; SI-NEXT: v_writelane_b32 v42, s17, 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v42, s16, 25 +; SI-NEXT: v_writelane_b32 v42, s17, 26 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v42, s16, 23 +; SI-NEXT: v_writelane_b32 v42, s17, 24 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v42, s16, 21 +; SI-NEXT: v_writelane_b32 v42, s17, 22 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v42, s16, 31 +; SI-NEXT: v_writelane_b32 v42, s17, 32 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v42, s16, 29 +; SI-NEXT: v_writelane_b32 v42, s17, 30 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v42, s16, 27 +; SI-NEXT: v_writelane_b32 v42, s17, 28 +; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v42, s16, 37 +; SI-NEXT: v_writelane_b32 v42, s17, 38 +; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v42, s16, 35 +; SI-NEXT: v_writelane_b32 v42, s17, 36 +; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v42, s16, 33 +; SI-NEXT: v_writelane_b32 v42, s17, 34 +; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v42, s16, 43 +; SI-NEXT: v_writelane_b32 v42, s17, 44 +; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v42, s16, 41 +; SI-NEXT: v_writelane_b32 v42, s17, 42 +; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v42, s16, 39 +; SI-NEXT: v_writelane_b32 v42, s17, 40 +; SI-NEXT: s_lshr_b64 s[16:17], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v42, s16, 49 +; SI-NEXT: v_writelane_b32 v42, s17, 50 +; SI-NEXT: s_lshr_b64 s[16:17], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v42, s16, 47 +; SI-NEXT: v_writelane_b32 v42, s17, 48 +; SI-NEXT: s_lshr_b64 s[16:17], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v42, s16, 45 +; SI-NEXT: v_writelane_b32 v42, s17, 46 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v42, s16, 55 +; SI-NEXT: v_writelane_b32 v42, s17, 56 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v42, s16, 53 +; SI-NEXT: v_writelane_b32 v42, s17, 54 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v42, s16, 51 +; SI-NEXT: v_writelane_b32 v42, s17, 52 ; SI-NEXT: s_lshr_b32 s16, s47, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 20 -; SI-NEXT: v_writelane_b32 v43, s16, 41 +; SI-NEXT: v_writelane_b32 v42, s16, 57 +; SI-NEXT: s_lshr_b32 s16, s45, 8 +; SI-NEXT: v_writelane_b32 v42, s16, 59 ; SI-NEXT: s_lshr_b32 s16, s43, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 21 -; SI-NEXT: s_lshr_b64 vcc, s[56:57], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 44 +; SI-NEXT: v_writelane_b32 v42, s16, 61 ; SI-NEXT: s_lshr_b32 s16, s41, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 28 -; SI-NEXT: v_writelane_b32 v43, s16, 47 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v43, s16, 0 ; SI-NEXT: s_lshr_b32 s16, s15, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 29 -; SI-NEXT: s_lshr_b64 vcc, s[56:57], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 50 +; SI-NEXT: v_writelane_b32 v43, s16, 3 ; SI-NEXT: s_lshr_b32 s16, s13, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 26 -; SI-NEXT: v_writelane_b32 v43, s16, 53 +; SI-NEXT: v_writelane_b32 v43, s16, 6 ; SI-NEXT: s_lshr_b32 s16, s11, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 27 -; SI-NEXT: s_lshr_b64 vcc, s[56:57], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 56 +; SI-NEXT: v_writelane_b32 v43, s16, 9 ; SI-NEXT: s_lshr_b32 s16, s9, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 59 +; SI-NEXT: s_and_b32 s17, s88, 0xffff +; SI-NEXT: v_writelane_b32 v43, s16, 12 ; SI-NEXT: s_lshr_b32 s16, s7, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 25 -; SI-NEXT: s_lshr_b64 vcc, s[46:47], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 62 +; SI-NEXT: v_writelane_b32 v42, s17, 58 +; SI-NEXT: s_and_b32 s17, s93, 0xffff +; SI-NEXT: v_writelane_b32 v43, s16, 15 ; SI-NEXT: s_lshr_b32 s16, s5, 8 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 32 -; SI-NEXT: v_writelane_b32 v42, s16, 1 -; SI-NEXT: s_and_b32 s16, s19, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 33 -; SI-NEXT: s_lshr_b64 vcc, s[46:47], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 19 -; SI-NEXT: s_and_b32 s16, s23, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 30 -; SI-NEXT: v_writelane_b32 v43, s16, 22 -; SI-NEXT: s_and_b32 s16, s27, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 31 -; SI-NEXT: s_lshr_b64 vcc, s[42:43], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 25 -; SI-NEXT: s_and_b32 s16, s78, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 38 -; SI-NEXT: v_writelane_b32 v43, s16, 28 -; SI-NEXT: s_and_b32 s16, s84, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 39 -; SI-NEXT: s_lshr_b64 vcc, s[42:43], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 31 -; SI-NEXT: s_and_b32 s16, s85, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 36 -; SI-NEXT: v_writelane_b32 v43, s16, 34 -; SI-NEXT: s_and_b32 s16, s86, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 37 -; SI-NEXT: s_lshr_b64 vcc, s[42:43], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 37 -; SI-NEXT: s_and_b32 s16, s87, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 34 -; SI-NEXT: v_writelane_b32 v43, s16, 40 -; SI-NEXT: s_and_b32 s16, s96, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 35 -; SI-NEXT: s_lshr_b64 vcc, s[40:41], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 43 -; SI-NEXT: s_and_b32 s16, s97, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 44 -; SI-NEXT: v_writelane_b32 v43, s16, 46 -; SI-NEXT: s_and_b32 s16, s98, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 45 -; SI-NEXT: s_lshr_b64 vcc, s[40:41], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 49 -; SI-NEXT: s_and_b32 s16, s99, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 42 -; SI-NEXT: v_writelane_b32 v43, s16, 52 -; SI-NEXT: s_and_b32 s16, s76, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 43 -; SI-NEXT: s_lshr_b64 vcc, s[40:41], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 55 -; SI-NEXT: s_and_b32 s16, s77, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 40 -; SI-NEXT: v_writelane_b32 v43, s16, 58 -; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 41 -; SI-NEXT: s_lshr_b64 vcc, s[14:15], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 61 -; SI-NEXT: s_and_b32 s16, s89, 0xffff -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 50 -; SI-NEXT: v_writelane_b32 v42, s16, 0 -; SI-NEXT: s_bfe_u32 s16, s19, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 51 -; SI-NEXT: s_lshr_b64 vcc, s[14:15], 16 +; SI-NEXT: v_writelane_b32 v42, s17, 60 +; SI-NEXT: s_and_b32 s17, s34, 0xffff ; SI-NEXT: v_writelane_b32 v43, s16, 18 -; SI-NEXT: s_bfe_u32 s16, s23, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 48 -; SI-NEXT: v_writelane_b32 v43, s16, 21 -; SI-NEXT: s_bfe_u32 s16, s27, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 49 -; SI-NEXT: s_lshr_b64 vcc, s[14:15], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 24 -; SI-NEXT: s_bfe_u32 s16, s78, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 46 -; SI-NEXT: v_writelane_b32 v43, s16, 27 -; SI-NEXT: s_bfe_u32 s16, s84, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 47 -; SI-NEXT: s_lshr_b64 vcc, s[12:13], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 30 -; SI-NEXT: s_bfe_u32 s16, s85, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 56 -; SI-NEXT: v_writelane_b32 v43, s16, 33 -; SI-NEXT: s_bfe_u32 s16, s86, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 57 -; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 36 -; SI-NEXT: s_bfe_u32 s16, s87, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 54 -; SI-NEXT: v_writelane_b32 v43, s16, 39 -; SI-NEXT: s_bfe_u32 s16, s96, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 55 -; SI-NEXT: s_lshr_b64 vcc, s[12:13], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 42 -; SI-NEXT: s_bfe_u32 s16, s97, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 52 -; SI-NEXT: v_writelane_b32 v43, s16, 45 -; SI-NEXT: s_bfe_u32 s16, s98, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 53 -; SI-NEXT: s_lshr_b64 vcc, s[10:11], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 48 -; SI-NEXT: s_bfe_u32 s16, s99, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 62 -; SI-NEXT: v_writelane_b32 v43, s16, 51 -; SI-NEXT: s_bfe_u32 s16, s76, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 63 -; SI-NEXT: s_lshr_b64 vcc, s[10:11], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 54 -; SI-NEXT: s_bfe_u32 s16, s77, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 60 -; SI-NEXT: v_writelane_b32 v43, s16, 57 -; SI-NEXT: s_bfe_u32 s16, s17, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 61 -; SI-NEXT: s_lshr_b64 vcc, s[10:11], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 60 -; SI-NEXT: s_bfe_u32 s16, s89, 0x80008 -; SI-NEXT: v_writelane_b32 v41, vcc_lo, 58 -; SI-NEXT: v_writelane_b32 v43, s16, 63 -; SI-NEXT: v_writelane_b32 v41, vcc_hi, 59 -; SI-NEXT: s_lshr_b64 vcc, s[8:9], 24 -; SI-NEXT: s_mov_b32 s16, s93 -; SI-NEXT: s_lshr_b64 s[92:93], s[60:61], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 4 -; SI-NEXT: s_mov_b32 s93, s16 -; SI-NEXT: s_mov_b32 s16, s71 -; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 5 -; SI-NEXT: s_lshr_b64 vcc, s[8:9], 16 -; SI-NEXT: s_mov_b32 s71, s16 -; SI-NEXT: s_mov_b32 s16, s81 -; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 2 -; SI-NEXT: s_mov_b32 s81, s16 -; SI-NEXT: s_mov_b32 s16, s83 -; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 3 -; SI-NEXT: s_lshr_b64 vcc, s[8:9], 8 -; SI-NEXT: s_mov_b32 s83, s16 -; SI-NEXT: s_mov_b32 s16, s65 -; SI-NEXT: s_lshr_b64 s[64:65], s[74:75], 24 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 0 -; SI-NEXT: s_mov_b32 s65, s16 -; SI-NEXT: s_mov_b32 s16, s67 -; SI-NEXT: s_lshr_b64 s[66:67], s[74:75], 16 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 1 -; SI-NEXT: s_lshr_b64 vcc, s[6:7], 24 -; SI-NEXT: s_mov_b32 s67, s16 -; SI-NEXT: s_mov_b32 s16, s69 -; SI-NEXT: s_lshr_b64 s[68:69], s[74:75], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 10 -; SI-NEXT: s_mov_b32 s69, s16 -; SI-NEXT: s_mov_b32 s16, s51 -; SI-NEXT: s_lshr_b64 s[50:51], s[72:73], 24 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 11 -; SI-NEXT: s_lshr_b64 vcc, s[6:7], 16 -; SI-NEXT: s_mov_b32 s51, s16 -; SI-NEXT: s_mov_b32 s16, s53 -; SI-NEXT: s_lshr_b64 s[52:53], s[72:73], 16 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 8 -; SI-NEXT: s_mov_b32 s53, s16 -; SI-NEXT: s_mov_b32 s16, s55 -; SI-NEXT: s_lshr_b64 s[54:55], s[72:73], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 9 -; SI-NEXT: s_lshr_b64 vcc, s[6:7], 8 -; SI-NEXT: s_mov_b32 s55, s16 -; SI-NEXT: s_mov_b32 s16, s37 -; SI-NEXT: s_lshr_b64 s[36:37], s[62:63], 24 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 6 -; SI-NEXT: s_mov_b32 s37, s16 -; SI-NEXT: s_mov_b32 s16, s39 -; SI-NEXT: s_lshr_b64 s[38:39], s[62:63], 16 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 7 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 -; SI-NEXT: s_mov_b32 s39, s16 -; SI-NEXT: s_mov_b32 s16, s49 -; SI-NEXT: s_lshr_b64 s[48:49], s[62:63], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 16 -; SI-NEXT: s_mov_b32 s49, s16 -; SI-NEXT: s_mov_b32 s16, s95 -; SI-NEXT: s_lshr_b64 s[94:95], s[58:59], 24 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 17 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 16 -; SI-NEXT: s_mov_b32 s95, s16 -; SI-NEXT: s_mov_b32 s16, s31 -; SI-NEXT: s_lshr_b64 s[30:31], s[58:59], 16 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 14 -; SI-NEXT: s_mov_b32 s31, s16 -; SI-NEXT: s_mov_b32 s16, s35 -; SI-NEXT: s_lshr_b64 s[34:35], s[58:59], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 15 -; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 -; SI-NEXT: s_mov_b32 s35, s16 -; SI-NEXT: s_mov_b32 s16, s91 -; SI-NEXT: s_lshr_b64 s[90:91], s[46:47], 8 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 12 -; SI-NEXT: s_mov_b32 s91, s16 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 13 +; SI-NEXT: v_writelane_b32 v42, s17, 63 +; SI-NEXT: s_and_b32 s17, s39, 0xffff +; SI-NEXT: v_writelane_b32 v43, s17, 2 +; SI-NEXT: s_and_b32 s17, s52, 0xffff +; SI-NEXT: v_writelane_b32 v43, s17, 5 +; SI-NEXT: s_and_b32 s17, s66, 0xffff +; SI-NEXT: v_writelane_b32 v43, s17, 8 +; SI-NEXT: s_and_b32 s17, s67, 0xffff +; SI-NEXT: v_writelane_b32 v43, s17, 11 +; SI-NEXT: s_and_b32 s17, s85, 0xffff +; SI-NEXT: v_writelane_b32 v43, s17, 14 +; SI-NEXT: s_and_b32 s17, s77, 0xffff +; SI-NEXT: s_bfe_u32 s18, s34, 0x80008 +; SI-NEXT: v_writelane_b32 v43, s17, 17 +; SI-NEXT: v_writelane_b32 v42, s18, 62 +; SI-NEXT: s_bfe_u32 s18, s39, 0x80008 +; SI-NEXT: v_writelane_b32 v43, s18, 1 +; SI-NEXT: s_bfe_u32 s18, s52, 0x80008 +; SI-NEXT: v_writelane_b32 v43, s18, 4 +; SI-NEXT: s_bfe_u32 s18, s66, 0x80008 +; SI-NEXT: v_writelane_b32 v43, s18, 7 +; SI-NEXT: s_bfe_u32 s18, s67, 0x80008 +; SI-NEXT: v_writelane_b32 v43, s18, 10 +; SI-NEXT: s_bfe_u32 s18, s85, 0x80008 +; SI-NEXT: v_writelane_b32 v43, s18, 13 +; SI-NEXT: s_bfe_u32 s18, s77, 0x80008 +; SI-NEXT: s_mov_b32 s30, s79 +; SI-NEXT: s_mov_b32 s94, s90 +; SI-NEXT: s_mov_b32 s79, s78 +; SI-NEXT: s_mov_b32 s90, s76 +; SI-NEXT: s_lshr_b32 s78, s75, 8 +; SI-NEXT: s_lshr_b32 s24, s73, 8 +; SI-NEXT: s_lshr_b32 s20, s63, 8 +; SI-NEXT: s_lshr_b32 s82, s61, 8 +; SI-NEXT: s_lshr_b32 s97, s59, 8 +; SI-NEXT: s_lshr_b32 s51, s57, 8 +; SI-NEXT: s_and_b32 s21, s19, 0xffff +; SI-NEXT: s_and_b32 s83, s23, 0xffff +; SI-NEXT: s_and_b32 s71, s27, 0xffff +; SI-NEXT: s_and_b32 s16, s84, 0xffff +; SI-NEXT: s_and_b32 s98, s86, 0xffff +; SI-NEXT: s_and_b32 s49, s99, 0xffff +; SI-NEXT: s_and_b32 s28, s91, 0xffff +; SI-NEXT: s_bfe_u32 s29, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s25, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s17, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s81, s84, 0x80008 +; SI-NEXT: s_bfe_u32 s76, s86, 0x80008 +; SI-NEXT: s_bfe_u32 s55, s99, 0x80008 +; SI-NEXT: s_bfe_u32 s50, s91, 0x80008 +; SI-NEXT: s_bfe_u32 s35, s88, 0x80008 +; SI-NEXT: s_bfe_u32 s26, s93, 0x80008 +; SI-NEXT: v_writelane_b32 v43, s18, 16 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true -; SI-NEXT: s_add_i32 s4, s55, 3 +; SI-NEXT: s_add_i32 s4, s94, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_lshl_b32 s5, s79, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s91, 3 +; SI-NEXT: s_add_i32 s5, s90, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_lshl_b32 s6, s89, 16 +; SI-NEXT: s_lshl_b32 s6, s77, 16 +; SI-NEXT: s_add_i32 s92, s92, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_add_i32 s6, s83, 3 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s7, s35, 16 +; SI-NEXT: s_and_b32 s6, s92, 0xffff +; SI-NEXT: s_lshl_b32 s7, s30, 16 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_add_i32 s7, s95, 3 +; SI-NEXT: s_add_i32 s7, s31, 3 ; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_lshl_b32 s8, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s85, 16 +; SI-NEXT: s_add_i32 s69, s69, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readlane_b32 s8, v41, 19 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: v_readlane_b32 s9, v41, 18 -; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s8, s69, 0xffff +; SI-NEXT: s_lshl_b32 s9, s96, 16 +; SI-NEXT: s_add_i32 s80, s80, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v41, 17 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_lshl_b32 s10, s77, 16 +; SI-NEXT: s_and_b32 s9, s80, 0xffff +; SI-NEXT: s_lshl_b32 s10, s67, 16 +; SI-NEXT: s_add_i32 s87, s87, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readlane_b32 s10, v41, 16 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: v_readlane_b32 s11, v41, 15 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_and_b32 s10, s87, 0xffff +; SI-NEXT: s_lshl_b32 s11, s65, 16 +; SI-NEXT: s_add_i32 s70, s70, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readlane_b32 s11, v41, 14 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: s_lshl_b32 s12, s76, 16 +; SI-NEXT: s_and_b32 s11, s70, 0xffff +; SI-NEXT: s_lshl_b32 s12, s66, 16 +; SI-NEXT: s_add_i32 s68, s68, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v41, 13 -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_readlane_b32 s13, v41, 12 -; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s12, s68, 0xffff +; SI-NEXT: s_lshl_b32 s13, s64, 16 +; SI-NEXT: s_add_i32 s54, s54, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v41, 11 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_and_b32 s13, s13, 0xffff -; SI-NEXT: s_lshl_b32 s14, s99, 16 +; SI-NEXT: s_and_b32 s13, s54, 0xffff +; SI-NEXT: s_lshl_b32 s14, s52, 16 +; SI-NEXT: s_add_i32 s53, s53, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: v_readlane_b32 s14, v41, 10 -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: v_readlane_b32 s15, v41, 9 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s14, s53, 0xffff +; SI-NEXT: s_lshl_b32 s15, s89, 16 +; SI-NEXT: s_add_i32 s48, s48, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_readlane_b32 s15, v41, 8 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_lshl_b32 s16, s98, 16 +; SI-NEXT: s_and_b32 s15, s48, 0xffff +; SI-NEXT: s_lshl_b32 s16, s39, 16 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readlane_b32 s16, v41, 7 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v41, 6 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s16, s22, 0xffff +; SI-NEXT: s_lshl_b32 s17, s38, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s37, s37, 3 ; SI-NEXT: s_add_i32 s40, s16, 0x30000 -; SI-NEXT: v_readlane_b32 s16, v41, 5 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s97, 16 +; SI-NEXT: s_and_b32 s16, s37, 0xffff +; SI-NEXT: s_lshl_b32 s17, s34, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s36, s36, 3 ; SI-NEXT: s_add_i32 s41, s16, 0x30000 -; SI-NEXT: v_readlane_b32 s16, v41, 4 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v41, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_and_b32 s16, s36, 0xffff +; SI-NEXT: s_lshl_b32 s17, s95, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s42, s16, 0x30000 -; SI-NEXT: v_readlane_b32 s16, v41, 2 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s96, 16 +; SI-NEXT: v_readlane_b32 s16, v41, 24 +; SI-NEXT: s_add_i32 s95, s16, 3 +; SI-NEXT: s_and_b32 s16, s95, 0xffff +; SI-NEXT: s_lshl_b32 s17, s93, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s43, s16, 0x30000 -; SI-NEXT: v_readlane_b32 s16, v41, 1 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v41, 0 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_readlane_b32 s16, v41, 23 +; SI-NEXT: s_add_i32 s94, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 22 +; SI-NEXT: s_and_b32 s16, s94, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s44, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 21 +; SI-NEXT: s_add_i32 s79, s16, 3 +; SI-NEXT: s_and_b32 s16, s79, 0xffff +; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s45, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 20 +; SI-NEXT: s_add_i32 s90, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 19 +; SI-NEXT: s_and_b32 s16, s90, 0xffff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s46, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s81, 3 +; SI-NEXT: v_readlane_b32 s16, v41, 18 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s87, 16 +; SI-NEXT: s_lshl_b32 s17, s91, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s47, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s71, 3 +; SI-NEXT: v_readlane_b32 s16, v41, 17 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 16 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s69, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s58, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s67, 3 +; SI-NEXT: s_add_i32 s56, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 15 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s86, 16 +; SI-NEXT: s_lshl_b32 s17, s99, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s59, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s65, 3 +; SI-NEXT: s_add_i32 s57, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 14 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 12 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s62, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s51, 3 +; SI-NEXT: s_add_i32 s58, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 11 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s85, 16 +; SI-NEXT: s_lshl_b32 s17, s86, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s63, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s49, 3 +; SI-NEXT: s_add_i32 s59, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 10 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 9 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s37, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s72, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s31, 3 +; SI-NEXT: s_add_i32 s60, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 13 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_lshl_b32 s17, s84, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s73, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s28, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s29, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s74, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s39, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s78, 16 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s75, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s24, 3 +; SI-NEXT: s_add_i32 s61, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 7 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s25, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s44, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s26, 3 +; SI-NEXT: s_add_i32 s62, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 6 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_lshl_b32 s17, s27, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s45, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s20, 3 +; SI-NEXT: s_add_i32 s63, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 5 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 4 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s21, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s56, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s22, 3 +; SI-NEXT: s_add_i32 s72, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_lshl_b32 s17, s23, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s57, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s79, 3 +; SI-NEXT: s_add_i32 s73, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 2 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 1 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s60, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s18, 3 +; SI-NEXT: s_add_i32 s74, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 0 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_lshl_b32 s17, s19, 16 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s61, s16, 0x30000 +; SI-NEXT: s_add_i32 s75, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[16:17], s[74:75], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 29 +; SI-NEXT: v_writelane_b32 v41, s17, 30 +; SI-NEXT: s_lshr_b64 s[16:17], s[74:75], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 27 +; SI-NEXT: v_writelane_b32 v41, s17, 28 +; SI-NEXT: s_lshr_b64 s[16:17], s[74:75], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 25 +; SI-NEXT: v_writelane_b32 v41, s17, 26 +; SI-NEXT: s_lshr_b64 s[16:17], s[72:73], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 35 +; SI-NEXT: v_writelane_b32 v41, s17, 36 +; SI-NEXT: s_lshr_b64 s[16:17], s[72:73], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 33 +; SI-NEXT: v_writelane_b32 v41, s17, 34 +; SI-NEXT: s_lshr_b64 s[16:17], s[72:73], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 31 +; SI-NEXT: v_writelane_b32 v41, s17, 32 +; SI-NEXT: s_lshr_b64 s[16:17], s[62:63], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 41 +; SI-NEXT: v_writelane_b32 v41, s17, 42 +; SI-NEXT: s_lshr_b64 s[16:17], s[62:63], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 39 +; SI-NEXT: v_writelane_b32 v41, s17, 40 +; SI-NEXT: s_lshr_b64 s[16:17], s[62:63], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 37 +; SI-NEXT: v_writelane_b32 v41, s17, 38 ; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 22 -; SI-NEXT: v_writelane_b32 v41, s17, 23 +; SI-NEXT: v_writelane_b32 v41, s16, 47 +; SI-NEXT: v_writelane_b32 v41, s17, 48 ; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 20 -; SI-NEXT: s_lshr_b32 s16, s61, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 18 -; SI-NEXT: s_lshr_b32 s16, s61, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 19 -; SI-NEXT: s_lshr_b32 s16, s61, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 20 -; SI-NEXT: s_lshr_b32 s16, s57, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 21 -; SI-NEXT: s_lshr_b32 s16, s57, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 22 -; SI-NEXT: s_lshr_b32 s16, s57, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 23 -; SI-NEXT: s_lshr_b32 s16, s45, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 24 -; SI-NEXT: s_lshr_b32 s16, s45, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 25 -; SI-NEXT: s_lshr_b32 s16, s45, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 26 -; SI-NEXT: s_lshr_b32 s16, s75, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 27 -; SI-NEXT: s_lshr_b32 s16, s75, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 28 -; SI-NEXT: s_lshr_b32 s16, s75, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 29 -; SI-NEXT: s_lshr_b32 s16, s73, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 30 -; SI-NEXT: s_lshr_b32 s16, s73, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 31 -; SI-NEXT: s_lshr_b32 s16, s73, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 32 -; SI-NEXT: s_lshr_b32 s16, s63, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 33 -; SI-NEXT: s_lshr_b32 s16, s63, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 34 -; SI-NEXT: s_lshr_b32 s16, s63, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 35 -; SI-NEXT: s_lshr_b32 s16, s59, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 36 -; SI-NEXT: s_lshr_b32 s16, s59, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 37 -; SI-NEXT: s_lshr_b32 s16, s59, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 38 -; SI-NEXT: s_lshr_b32 s16, s47, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 39 -; SI-NEXT: s_lshr_b32 s16, s47, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 40 -; SI-NEXT: s_lshr_b32 s16, s47, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 41 -; SI-NEXT: s_lshr_b32 s16, s43, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 42 -; SI-NEXT: s_lshr_b32 s16, s43, 16 -; SI-NEXT: v_writelane_b32 v43, s16, 43 -; SI-NEXT: s_lshr_b32 s16, s43, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 44 -; SI-NEXT: s_lshr_b32 s16, s41, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 45 -; SI-NEXT: s_lshr_b32 s16, s41, 16 -; SI-NEXT: s_add_i32 s15, s15, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 46 -; SI-NEXT: s_lshr_b32 s16, s41, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 47 -; SI-NEXT: s_lshr_b32 s16, s15, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 48 -; SI-NEXT: s_lshr_b32 s16, s15, 16 -; SI-NEXT: s_add_i32 s13, s13, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 49 -; SI-NEXT: s_lshr_b32 s16, s15, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 50 -; SI-NEXT: s_lshr_b32 s16, s13, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 51 -; SI-NEXT: s_lshr_b32 s16, s13, 16 -; SI-NEXT: s_add_i32 s11, s11, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 52 -; SI-NEXT: s_lshr_b32 s16, s13, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 53 -; SI-NEXT: s_lshr_b32 s16, s11, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 54 -; SI-NEXT: s_lshr_b32 s16, s11, 16 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 55 -; SI-NEXT: s_lshr_b32 s16, s11, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 56 -; SI-NEXT: s_lshr_b32 s16, s9, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 57 -; SI-NEXT: s_lshr_b32 s16, s9, 16 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 58 -; SI-NEXT: s_lshr_b32 s16, s9, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 59 -; SI-NEXT: s_lshr_b32 s16, s7, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 60 -; SI-NEXT: s_lshr_b32 s16, s7, 16 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 61 -; SI-NEXT: s_lshr_b32 s16, s7, 8 -; SI-NEXT: v_writelane_b32 v43, s16, 62 -; SI-NEXT: s_lshr_b32 s16, s5, 24 -; SI-NEXT: v_writelane_b32 v43, s16, 63 -; SI-NEXT: s_lshr_b32 s16, s5, 16 -; SI-NEXT: v_writelane_b32 v42, s16, 0 -; SI-NEXT: s_lshr_b32 s16, s5, 8 -; SI-NEXT: v_writelane_b32 v41, s17, 21 -; SI-NEXT: v_writelane_b32 v42, s16, 1 +; SI-NEXT: v_writelane_b32 v41, s16, 45 +; SI-NEXT: v_writelane_b32 v41, s17, 46 +; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 43 +; SI-NEXT: v_writelane_b32 v41, s17, 44 +; SI-NEXT: s_lshr_b64 s[16:17], s[58:59], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 53 +; SI-NEXT: v_writelane_b32 v41, s17, 54 +; SI-NEXT: s_lshr_b64 s[16:17], s[58:59], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 51 +; SI-NEXT: v_writelane_b32 v41, s17, 52 +; SI-NEXT: s_lshr_b64 s[16:17], s[58:59], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 49 +; SI-NEXT: v_writelane_b32 v41, s17, 50 ; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 28 -; SI-NEXT: v_writelane_b32 v41, s17, 29 +; SI-NEXT: v_writelane_b32 v41, s16, 59 +; SI-NEXT: v_writelane_b32 v41, s17, 60 ; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 26 -; SI-NEXT: v_writelane_b32 v41, s17, 27 +; SI-NEXT: v_writelane_b32 v41, s16, 57 +; SI-NEXT: v_writelane_b32 v41, s17, 58 ; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 8 -; SI-NEXT: v_writelane_b32 v41, s16, 24 -; SI-NEXT: v_writelane_b32 v41, s17, 25 +; SI-NEXT: v_writelane_b32 v41, s16, 55 +; SI-NEXT: v_writelane_b32 v41, s17, 56 ; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 32 -; SI-NEXT: v_writelane_b32 v41, s17, 33 +; SI-NEXT: v_writelane_b32 v42, s16, 1 +; SI-NEXT: v_writelane_b32 v42, s17, 2 ; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 30 -; SI-NEXT: v_writelane_b32 v41, s17, 31 +; SI-NEXT: v_writelane_b32 v41, s16, 63 +; SI-NEXT: v_writelane_b32 v42, s17, 0 +; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 61 +; SI-NEXT: v_writelane_b32 v41, s17, 62 +; SI-NEXT: s_lshr_b64 s[16:17], s[44:45], 24 +; SI-NEXT: v_writelane_b32 v42, s16, 7 +; SI-NEXT: v_writelane_b32 v42, s17, 8 +; SI-NEXT: s_lshr_b64 s[16:17], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v42, s16, 5 +; SI-NEXT: v_writelane_b32 v42, s17, 6 +; SI-NEXT: s_lshr_b64 s[16:17], s[44:45], 8 +; SI-NEXT: v_writelane_b32 v42, s16, 3 +; SI-NEXT: v_writelane_b32 v42, s17, 4 ; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 38 -; SI-NEXT: v_writelane_b32 v41, s17, 39 +; SI-NEXT: v_writelane_b32 v42, s16, 13 +; SI-NEXT: v_writelane_b32 v42, s17, 14 ; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 36 -; SI-NEXT: v_writelane_b32 v41, s17, 37 +; SI-NEXT: v_writelane_b32 v42, s16, 11 +; SI-NEXT: v_writelane_b32 v42, s17, 12 ; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 8 -; SI-NEXT: v_writelane_b32 v41, s16, 34 -; SI-NEXT: v_writelane_b32 v41, s17, 35 +; SI-NEXT: v_writelane_b32 v42, s16, 9 +; SI-NEXT: v_writelane_b32 v42, s17, 10 ; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 44 -; SI-NEXT: v_writelane_b32 v41, s17, 45 +; SI-NEXT: v_writelane_b32 v42, s16, 19 +; SI-NEXT: v_writelane_b32 v42, s17, 20 ; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 42 -; SI-NEXT: v_writelane_b32 v41, s17, 43 +; SI-NEXT: v_writelane_b32 v42, s16, 17 +; SI-NEXT: v_writelane_b32 v42, s17, 18 ; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 8 ; SI-NEXT: s_add_i32 s14, s14, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 40 -; SI-NEXT: v_writelane_b32 v41, s17, 41 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: v_writelane_b32 v42, s16, 15 +; SI-NEXT: v_writelane_b32 v42, s17, 16 ; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 50 -; SI-NEXT: v_writelane_b32 v41, s17, 51 +; SI-NEXT: v_writelane_b32 v42, s16, 25 +; SI-NEXT: v_writelane_b32 v42, s17, 26 ; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 48 -; SI-NEXT: v_writelane_b32 v41, s17, 49 +; SI-NEXT: v_writelane_b32 v42, s16, 23 +; SI-NEXT: v_writelane_b32 v42, s17, 24 ; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 ; SI-NEXT: s_add_i32 s12, s12, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 46 -; SI-NEXT: v_writelane_b32 v41, s17, 47 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: v_writelane_b32 v42, s16, 21 +; SI-NEXT: v_writelane_b32 v42, s17, 22 ; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 56 -; SI-NEXT: v_writelane_b32 v41, s17, 57 +; SI-NEXT: v_writelane_b32 v42, s16, 31 +; SI-NEXT: v_writelane_b32 v42, s17, 32 ; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 54 -; SI-NEXT: v_writelane_b32 v41, s17, 55 +; SI-NEXT: v_writelane_b32 v42, s16, 29 +; SI-NEXT: v_writelane_b32 v42, s17, 30 ; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 8 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 52 -; SI-NEXT: v_writelane_b32 v41, s17, 53 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: v_writelane_b32 v42, s16, 27 +; SI-NEXT: v_writelane_b32 v42, s17, 28 ; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v41, s16, 62 -; SI-NEXT: v_writelane_b32 v41, s17, 63 +; SI-NEXT: v_writelane_b32 v42, s16, 37 +; SI-NEXT: v_writelane_b32 v42, s17, 38 ; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 16 -; SI-NEXT: v_writelane_b32 v41, s16, 60 -; SI-NEXT: v_writelane_b32 v41, s17, 61 +; SI-NEXT: v_writelane_b32 v42, s16, 35 +; SI-NEXT: v_writelane_b32 v42, s17, 36 ; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 8 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 58 -; SI-NEXT: v_writelane_b32 v41, s17, 59 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: v_writelane_b32 v42, s16, 33 +; SI-NEXT: v_writelane_b32 v42, s17, 34 ; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 4 -; SI-NEXT: v_writelane_b32 v43, s17, 5 +; SI-NEXT: v_writelane_b32 v42, s16, 43 +; SI-NEXT: v_writelane_b32 v42, s17, 44 ; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 2 -; SI-NEXT: v_writelane_b32 v43, s17, 3 +; SI-NEXT: v_writelane_b32 v42, s16, 41 +; SI-NEXT: v_writelane_b32 v42, s17, 42 ; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 8 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 0 -; SI-NEXT: v_writelane_b32 v43, s17, 1 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: v_writelane_b32 v42, s16, 39 +; SI-NEXT: v_writelane_b32 v42, s17, 40 ; SI-NEXT: s_lshr_b64 s[16:17], s[6:7], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 10 -; SI-NEXT: v_writelane_b32 v43, s17, 11 +; SI-NEXT: v_writelane_b32 v42, s16, 49 +; SI-NEXT: v_writelane_b32 v42, s17, 50 ; SI-NEXT: s_lshr_b64 s[16:17], s[6:7], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 8 -; SI-NEXT: v_writelane_b32 v43, s17, 9 +; SI-NEXT: v_writelane_b32 v42, s16, 47 +; SI-NEXT: v_writelane_b32 v42, s17, 48 ; SI-NEXT: s_lshr_b64 s[16:17], s[6:7], 8 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: v_writelane_b32 v43, s16, 6 -; SI-NEXT: v_writelane_b32 v43, s17, 7 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: v_writelane_b32 v42, s16, 45 +; SI-NEXT: v_writelane_b32 v42, s17, 46 ; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 24 -; SI-NEXT: v_writelane_b32 v43, s16, 16 -; SI-NEXT: v_writelane_b32 v43, s17, 17 +; SI-NEXT: v_writelane_b32 v42, s16, 55 +; SI-NEXT: v_writelane_b32 v42, s17, 56 ; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 -; SI-NEXT: v_writelane_b32 v43, s16, 14 -; SI-NEXT: v_writelane_b32 v43, s17, 15 +; SI-NEXT: v_writelane_b32 v42, s16, 53 +; SI-NEXT: v_writelane_b32 v42, s17, 54 ; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[60:61], 8 -; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 -; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 -; SI-NEXT: s_lshr_b64 s[64:65], s[74:75], 24 -; SI-NEXT: s_lshr_b64 s[66:67], s[74:75], 16 -; SI-NEXT: s_lshr_b64 s[68:69], s[74:75], 8 -; SI-NEXT: s_lshr_b64 s[50:51], s[72:73], 24 -; SI-NEXT: s_lshr_b64 s[52:53], s[72:73], 16 -; SI-NEXT: s_lshr_b64 s[54:55], s[72:73], 8 -; SI-NEXT: s_lshr_b64 s[36:37], s[62:63], 24 -; SI-NEXT: s_lshr_b64 s[38:39], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[48:49], s[62:63], 8 -; SI-NEXT: s_lshr_b64 s[94:95], s[58:59], 24 -; SI-NEXT: s_lshr_b64 s[30:31], s[58:59], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[58:59], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[46:47], 8 -; SI-NEXT: v_writelane_b32 v43, s16, 12 -; SI-NEXT: v_writelane_b32 v43, s17, 13 +; SI-NEXT: v_writelane_b32 v42, s16, 51 +; SI-NEXT: v_writelane_b32 v42, s17, 52 +; SI-NEXT: s_lshr_b32 s18, s47, 8 +; SI-NEXT: v_writelane_b32 v42, s18, 57 +; SI-NEXT: s_lshr_b32 s18, s45, 16 +; SI-NEXT: v_writelane_b32 v42, s18, 58 +; SI-NEXT: s_lshr_b32 s18, s45, 8 +; SI-NEXT: v_writelane_b32 v42, s18, 59 +; SI-NEXT: s_lshr_b32 s18, s43, 16 +; SI-NEXT: v_writelane_b32 v42, s18, 60 +; SI-NEXT: s_lshr_b32 s18, s43, 8 +; SI-NEXT: v_writelane_b32 v42, s18, 61 +; SI-NEXT: s_lshr_b32 s18, s41, 24 +; SI-NEXT: v_writelane_b32 v42, s18, 62 +; SI-NEXT: s_lshr_b32 s18, s41, 16 +; SI-NEXT: v_writelane_b32 v42, s18, 63 +; SI-NEXT: s_lshr_b32 s18, s41, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v43, s18, 0 +; SI-NEXT: s_lshr_b32 s18, s15, 24 +; SI-NEXT: v_writelane_b32 v43, s18, 1 +; SI-NEXT: s_lshr_b32 s18, s15, 16 +; SI-NEXT: v_writelane_b32 v43, s18, 2 +; SI-NEXT: s_lshr_b32 s18, s15, 8 +; SI-NEXT: v_writelane_b32 v43, s18, 3 +; SI-NEXT: s_lshr_b32 s18, s13, 24 +; SI-NEXT: v_writelane_b32 v43, s18, 4 +; SI-NEXT: s_lshr_b32 s18, s13, 16 +; SI-NEXT: v_writelane_b32 v43, s18, 5 +; SI-NEXT: s_lshr_b32 s18, s13, 8 +; SI-NEXT: v_writelane_b32 v43, s18, 6 +; SI-NEXT: s_lshr_b32 s18, s11, 24 +; SI-NEXT: v_writelane_b32 v43, s18, 7 +; SI-NEXT: s_lshr_b32 s18, s11, 16 +; SI-NEXT: v_writelane_b32 v43, s18, 8 +; SI-NEXT: s_lshr_b32 s18, s11, 8 +; SI-NEXT: v_writelane_b32 v43, s18, 9 +; SI-NEXT: s_lshr_b32 s18, s9, 24 +; SI-NEXT: v_writelane_b32 v43, s18, 10 +; SI-NEXT: s_lshr_b32 s18, s9, 16 +; SI-NEXT: v_writelane_b32 v43, s18, 11 +; SI-NEXT: s_lshr_b32 s18, s9, 8 +; SI-NEXT: v_writelane_b32 v43, s18, 12 +; SI-NEXT: s_lshr_b32 s18, s7, 24 +; SI-NEXT: v_writelane_b32 v43, s18, 13 +; SI-NEXT: s_lshr_b32 s18, s7, 16 +; SI-NEXT: v_writelane_b32 v43, s18, 14 +; SI-NEXT: s_lshr_b32 s18, s7, 8 +; SI-NEXT: v_writelane_b32 v43, s18, 15 +; SI-NEXT: s_lshr_b32 s18, s5, 24 +; SI-NEXT: v_writelane_b32 v43, s18, 16 +; SI-NEXT: s_lshr_b32 s18, s5, 16 +; SI-NEXT: s_lshr_b32 s29, s75, 24 +; SI-NEXT: s_lshr_b32 s21, s75, 16 +; SI-NEXT: s_lshr_b32 s78, s75, 8 +; SI-NEXT: s_lshr_b32 s25, s73, 24 +; SI-NEXT: s_lshr_b32 s83, s73, 16 +; SI-NEXT: s_lshr_b32 s24, s73, 8 +; SI-NEXT: s_lshr_b32 s17, s63, 24 +; SI-NEXT: s_lshr_b32 s71, s63, 16 +; SI-NEXT: s_lshr_b32 s20, s63, 8 +; SI-NEXT: s_lshr_b32 s81, s61, 24 +; SI-NEXT: s_lshr_b32 s16, s61, 16 +; SI-NEXT: s_lshr_b32 s82, s61, 8 +; SI-NEXT: s_lshr_b32 s76, s59, 24 +; SI-NEXT: s_lshr_b32 s98, s59, 16 +; SI-NEXT: s_lshr_b32 s97, s59, 8 +; SI-NEXT: s_lshr_b32 s55, s57, 24 +; SI-NEXT: s_lshr_b32 s49, s57, 16 +; SI-NEXT: s_lshr_b32 s51, s57, 8 +; SI-NEXT: s_lshr_b32 s50, s47, 24 +; SI-NEXT: s_lshr_b32 s28, s47, 16 +; SI-NEXT: s_lshr_b32 s35, s45, 24 +; SI-NEXT: s_lshr_b32 s26, s43, 24 +; SI-NEXT: v_writelane_b32 v43, s18, 17 +; SI-NEXT: s_lshr_b32 s18, s5, 8 +; SI-NEXT: v_writelane_b32 v43, s18, 18 ; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: s_lshl_b32 s17, s92, 8 -; SI-NEXT: s_and_b32 s18, s60, 0xff +; SI-NEXT: v_readlane_b32 s22, v41, 25 +; SI-NEXT: s_and_b32 s18, s74, 0xff +; SI-NEXT: s_lshl_b32 s19, s22, 8 +; SI-NEXT: v_readlane_b32 s22, v41, 27 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s22, 0xff +; SI-NEXT: v_readlane_b32 s22, v41, 29 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s22, s22, 24 +; SI-NEXT: s_or_b32 s19, s22, s19 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: v_mov_b32_e32 v1, s18 +; SI-NEXT: s_and_b32 s18, s75, 0xff +; SI-NEXT: s_lshl_b32 s19, s78, 8 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s21, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s21, s29, 24 +; SI-NEXT: s_or_b32 s19, s21, s19 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: v_readlane_b32 s22, v41, 31 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: s_and_b32 s18, s72, 0xff +; SI-NEXT: s_lshl_b32 s19, s22, 8 +; SI-NEXT: v_readlane_b32 s22, v41, 33 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s22, 0xff +; SI-NEXT: v_readlane_b32 s22, v41, 35 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s21, s22, 24 +; SI-NEXT: s_or_b32 s19, s21, s19 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: v_mov_b32_e32 v3, s18 +; SI-NEXT: s_and_b32 s18, s73, 0xff +; SI-NEXT: s_lshl_b32 s19, s24, 8 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s83, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s21, s25, 24 +; SI-NEXT: s_or_b32 s19, s21, s19 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: v_readlane_b32 s22, v41, 37 +; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: s_and_b32 s18, s62, 0xff +; SI-NEXT: s_lshl_b32 s19, s22, 8 +; SI-NEXT: v_readlane_b32 s22, v41, 39 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s22, 0xff +; SI-NEXT: v_readlane_b32 s22, v41, 41 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s21, s22, 24 +; SI-NEXT: s_or_b32 s19, s21, s19 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: v_mov_b32_e32 v5, s18 +; SI-NEXT: s_and_b32 s18, s63, 0xff +; SI-NEXT: s_lshl_b32 s19, s20, 8 +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: s_and_b32 s19, s71, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_or_b32 s17, s17, s19 +; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 20 -; SI-NEXT: v_readlane_b32 s19, v41, 21 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: v_readlane_b32 s20, v41, 22 -; SI-NEXT: s_lshl_b32 s19, s20, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 20 -; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: s_and_b32 s17, s61, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 19 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 18 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: v_readlane_b32 s18, v41, 43 +; SI-NEXT: v_mov_b32_e32 v6, s17 +; SI-NEXT: s_and_b32 s17, s60, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: v_readlane_b32 s19, v41, 44 ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_readlane_b32 s16, v41, 24 -; SI-NEXT: v_readlane_b32 s17, v41, 25 -; SI-NEXT: s_lshl_b32 s17, s16, 8 -; SI-NEXT: s_and_b32 s18, s56, 0xff -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 26 -; SI-NEXT: v_readlane_b32 s19, v41, 27 +; SI-NEXT: v_readlane_b32 s18, v41, 45 +; SI-NEXT: v_readlane_b32 s19, v41, 46 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: v_readlane_b32 s20, v41, 28 -; SI-NEXT: s_lshl_b32 s19, s20, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 23 -; SI-NEXT: v_mov_b32_e32 v3, s17 -; SI-NEXT: s_and_b32 s17, s57, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 22 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 21 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v4, s17 -; SI-NEXT: s_lshl_b32 s17, s82, 8 -; SI-NEXT: s_and_b32 s18, s44, 0xff -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s18, s80, 0xff -; SI-NEXT: s_lshl_b32 s19, s70, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 26 -; SI-NEXT: v_mov_b32_e32 v5, s17 -; SI-NEXT: s_and_b32 s17, s45, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 25 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v6, s17 -; SI-NEXT: s_lshl_b32 s17, s68, 8 -; SI-NEXT: s_and_b32 s18, s74, 0xff -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s18, s66, 0xff -; SI-NEXT: s_lshl_b32 s19, s64, 24 +; SI-NEXT: v_readlane_b32 s20, v41, 47 ; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s20, 24 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 29 ; SI-NEXT: v_mov_b32_e32 v7, s17 -; SI-NEXT: s_and_b32 s17, s75, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 28 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 27 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v8, s17 -; SI-NEXT: s_lshl_b32 s17, s54, 8 -; SI-NEXT: s_and_b32 s18, s72, 0xff -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s18, s52, 0xff -; SI-NEXT: s_lshl_b32 s19, s50, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 32 -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: s_and_b32 s17, s73, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 31 +; SI-NEXT: s_and_b32 s17, s61, 0xff +; SI-NEXT: s_lshl_b32 s18, s82, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 30 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s18, s81, 24 +; SI-NEXT: s_or_b32 s16, s18, s16 ; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v10, s17 -; SI-NEXT: s_lshl_b32 s17, s48, 8 -; SI-NEXT: s_and_b32 s18, s62, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s18, v41, 49 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: s_and_b32 s16, s58, 0xff +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v41, 51 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v41, 53 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s18, s38, 0xff -; SI-NEXT: s_lshl_b32 s19, s36, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 35 -; SI-NEXT: v_mov_b32_e32 v11, s17 -; SI-NEXT: s_and_b32 s17, s63, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 34 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 33 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v12, s17 -; SI-NEXT: s_lshl_b32 s17, s34, 8 -; SI-NEXT: s_and_b32 s18, s58, 0xff +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: s_and_b32 s16, s59, 0xff +; SI-NEXT: s_lshl_b32 s17, s97, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s98, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s76, 24 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s18, s30, 0xff -; SI-NEXT: s_lshl_b32 s19, s94, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 38 -; SI-NEXT: v_mov_b32_e32 v13, s17 -; SI-NEXT: s_and_b32 s17, s59, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 37 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 36 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_mov_b32_e32 v14, s17 -; SI-NEXT: s_lshl_b32 s17, s90, 8 -; SI-NEXT: s_and_b32 s18, s46, 0xff +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s18, v41, 55 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: s_and_b32 s16, s56, 0xff +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v41, 57 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v41, 59 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 30 -; SI-NEXT: v_readlane_b32 s19, v41, 31 -; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: v_readlane_b32 s20, v41, 32 -; SI-NEXT: s_lshl_b32 s19, s20, 24 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: v_readlane_b32 s16, v43, 41 -; SI-NEXT: v_mov_b32_e32 v15, s17 -; SI-NEXT: s_and_b32 s17, s47, 0xff -; SI-NEXT: s_lshl_b32 s18, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v43, 40 -; SI-NEXT: s_or_b32 s17, s17, s18 -; SI-NEXT: s_and_b32 s18, s16, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 39 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: s_and_b32 s16, s57, 0xff +; SI-NEXT: s_lshl_b32 s17, s51, 8 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s17, s49, 0xff ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v16, s17 -; SI-NEXT: v_readlane_b32 s16, v41, 34 -; SI-NEXT: v_readlane_b32 s17, v41, 35 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s55, 24 ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: s_and_b32 s17, s42, 0xff -; SI-NEXT: v_readlane_b32 s18, v41, 36 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_and_b32 s17, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v41, 38 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s18, v41, 61 ; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s46, 0xff +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v41, 63 ; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v42, 1 ; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: v_readlane_b32 s17, v43, 44 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v42, 57 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: s_and_b32 s16, s43, 0xff +; SI-NEXT: s_and_b32 s16, s47, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s28, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s50, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v43, 43 -; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s18, v42, 3 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s44, 0xff +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v42, 5 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v42, 7 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v42, 59 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s45, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v42, 58 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 42 -; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s35, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s18, v42, 9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s42, 0xff +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v42, 11 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v42, 13 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v42, 61 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v42, 60 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s26, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 40 +; SI-NEXT: v_readlane_b32 s18, v42, 15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s40, 0xff ; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v41, 42 +; SI-NEXT: v_readlane_b32 s18, v42, 17 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v41, 44 +; SI-NEXT: v_readlane_b32 s18, v42, 19 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v43, 47 +; SI-NEXT: v_readlane_b32 s17, v43, 0 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s41, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v43, 46 +; SI-NEXT: v_readlane_b32 s17, v42, 63 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 45 +; SI-NEXT: v_readlane_b32 s18, v42, 62 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff @@ -215308,15 +214796,15 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: v_readlane_b32 s16, v41, 46 +; SI-NEXT: v_readlane_b32 s16, v42, 21 ; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s17, v41, 47 +; SI-NEXT: v_readlane_b32 s17, v42, 22 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: v_readlane_b32 s16, v41, 48 -; SI-NEXT: v_readlane_b32 s17, v41, 49 +; SI-NEXT: v_readlane_b32 s16, v42, 23 +; SI-NEXT: v_readlane_b32 s17, v42, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s18, v41, 50 +; SI-NEXT: v_readlane_b32 s18, v42, 25 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s18, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff @@ -215327,12 +214815,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: v_readlane_b32 s15, v43, 50 +; SI-NEXT: v_readlane_b32 s15, v43, 3 ; SI-NEXT: s_lshl_b32 s15, s15, 8 ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_readlane_b32 s15, v43, 49 +; SI-NEXT: v_readlane_b32 s15, v43, 2 ; SI-NEXT: s_and_b32 s15, s15, 0xff -; SI-NEXT: v_readlane_b32 s16, v43, 48 +; SI-NEXT: v_readlane_b32 s16, v43, 1 ; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff @@ -215342,15 +214830,15 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: v_readlane_b32 s14, v41, 52 +; SI-NEXT: v_readlane_b32 s14, v42, 27 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 8 -; SI-NEXT: v_readlane_b32 s15, v41, 53 +; SI-NEXT: v_readlane_b32 s15, v42, 28 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: v_readlane_b32 s14, v41, 54 -; SI-NEXT: v_readlane_b32 s15, v41, 55 +; SI-NEXT: v_readlane_b32 s14, v42, 29 +; SI-NEXT: v_readlane_b32 s15, v42, 30 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s16, v41, 56 +; SI-NEXT: v_readlane_b32 s16, v42, 31 ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_lshl_b32 s15, s16, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff @@ -215361,12 +214849,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: v_readlane_b32 s13, v43, 53 +; SI-NEXT: v_readlane_b32 s13, v43, 6 ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_readlane_b32 s13, v43, 52 +; SI-NEXT: v_readlane_b32 s13, v43, 5 ; SI-NEXT: s_and_b32 s13, s13, 0xff -; SI-NEXT: v_readlane_b32 s14, v43, 51 +; SI-NEXT: v_readlane_b32 s14, v43, 4 ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_lshl_b32 s14, s14, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff @@ -215376,15 +214864,15 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 -; SI-NEXT: v_readlane_b32 s12, v41, 58 +; SI-NEXT: v_readlane_b32 s12, v42, 33 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 8 -; SI-NEXT: v_readlane_b32 s13, v41, 59 +; SI-NEXT: v_readlane_b32 s13, v42, 34 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: v_readlane_b32 s12, v41, 60 -; SI-NEXT: v_readlane_b32 s13, v41, 61 +; SI-NEXT: v_readlane_b32 s12, v42, 35 +; SI-NEXT: v_readlane_b32 s13, v42, 36 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s14, v41, 62 +; SI-NEXT: v_readlane_b32 s14, v42, 37 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_lshl_b32 s13, s14, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff @@ -215395,12 +214883,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v43, 56 +; SI-NEXT: v_readlane_b32 s11, v43, 9 ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_readlane_b32 s11, v43, 55 +; SI-NEXT: v_readlane_b32 s11, v43, 8 ; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: v_readlane_b32 s12, v43, 54 +; SI-NEXT: v_readlane_b32 s12, v43, 7 ; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_lshl_b32 s12, s12, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff @@ -215410,15 +214898,15 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_readlane_b32 s10, v43, 0 +; SI-NEXT: v_readlane_b32 s10, v42, 39 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 8 -; SI-NEXT: v_readlane_b32 s11, v43, 1 +; SI-NEXT: v_readlane_b32 s11, v42, 40 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: v_readlane_b32 s10, v43, 2 -; SI-NEXT: v_readlane_b32 s11, v43, 3 +; SI-NEXT: v_readlane_b32 s10, v42, 41 +; SI-NEXT: v_readlane_b32 s11, v42, 42 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s12, v43, 4 +; SI-NEXT: v_readlane_b32 s12, v42, 43 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s11, s12, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff @@ -215429,12 +214917,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s8, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 59 +; SI-NEXT: v_readlane_b32 s9, v43, 12 ; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_readlane_b32 s9, v43, 58 +; SI-NEXT: v_readlane_b32 s9, v43, 11 ; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: v_readlane_b32 s10, v43, 57 +; SI-NEXT: v_readlane_b32 s10, v43, 10 ; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_lshl_b32 s10, s10, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff @@ -215444,15 +214932,15 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_readlane_b32 s8, v43, 6 +; SI-NEXT: v_readlane_b32 s8, v42, 45 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 7 +; SI-NEXT: v_readlane_b32 s9, v42, 46 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: v_readlane_b32 s8, v43, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 9 +; SI-NEXT: v_readlane_b32 s8, v42, 47 +; SI-NEXT: v_readlane_b32 s9, v42, 48 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s10, v43, 10 +; SI-NEXT: v_readlane_b32 s10, v42, 49 ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_lshl_b32 s9, s10, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff @@ -215463,69 +214951,89 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 62 +; SI-NEXT: v_readlane_b32 s7, v43, 15 ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_readlane_b32 s7, v43, 61 +; SI-NEXT: v_readlane_b32 s7, v43, 14 ; SI-NEXT: s_and_b32 s7, s7, 0xff -; SI-NEXT: v_readlane_b32 s8, v43, 60 +; SI-NEXT: v_readlane_b32 s8, v43, 13 +; SI-NEXT: v_readlane_b32 s19, v41, 50 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: v_readlane_b32 s19, v41, 52 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_readlane_b32 s19, v41, 54 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_readlane_b32 s19, v41, 56 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_readlane_b32 s6, v43, 12 +; SI-NEXT: v_readlane_b32 s6, v42, 51 +; SI-NEXT: v_readlane_b32 s19, v41, 58 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 13 +; SI-NEXT: v_readlane_b32 s19, v41, 60 +; SI-NEXT: v_readlane_b32 s7, v42, 52 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: v_readlane_b32 s6, v43, 14 -; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: v_readlane_b32 s6, v42, 53 +; SI-NEXT: v_readlane_b32 s19, v41, 62 +; SI-NEXT: v_readlane_b32 s7, v42, 54 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s8, v43, 16 +; SI-NEXT: v_readlane_b32 s8, v42, 55 +; SI-NEXT: v_readlane_b32 s19, v42, 0 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s8, 24 +; SI-NEXT: v_readlane_b32 s19, v42, 2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readlane_b32 s19, v42, 4 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: v_readlane_b32 s23, v41, 26 +; SI-NEXT: v_readlane_b32 s19, v42, 6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: v_readlane_b32 s5, v42, 1 +; SI-NEXT: v_readlane_b32 s5, v43, 18 +; SI-NEXT: v_readlane_b32 s23, v41, 28 +; SI-NEXT: v_readlane_b32 s19, v42, 8 ; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_readlane_b32 s23, v41, 30 +; SI-NEXT: v_readlane_b32 s19, v42, 10 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v42, 0 -; SI-NEXT: v_readlane_b32 s19, v41, 37 +; SI-NEXT: v_readlane_b32 s5, v43, 17 +; SI-NEXT: v_readlane_b32 s23, v41, 32 +; SI-NEXT: v_readlane_b32 s19, v42, 12 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s6, v43, 63 -; SI-NEXT: v_readlane_b32 s19, v41, 39 +; SI-NEXT: v_readlane_b32 s6, v43, 16 +; SI-NEXT: v_readlane_b32 s23, v41, 34 +; SI-NEXT: v_readlane_b32 s19, v42, 14 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 -; SI-NEXT: v_readlane_b32 s19, v41, 41 +; SI-NEXT: v_readlane_b32 s23, v41, 36 +; SI-NEXT: v_readlane_b32 s19, v42, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s21, v41, 23 -; SI-NEXT: v_readlane_b32 s19, v41, 43 +; SI-NEXT: v_readlane_b32 s23, v41, 38 +; SI-NEXT: v_readlane_b32 s19, v42, 18 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s21, v41, 29 -; SI-NEXT: v_readlane_b32 s19, v41, 45 +; SI-NEXT: v_readlane_b32 s23, v41, 40 +; SI-NEXT: v_readlane_b32 s19, v42, 20 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s21, v41, 33 -; SI-NEXT: v_readlane_b32 s19, v41, 51 -; SI-NEXT: v_readlane_b32 s17, v41, 57 -; SI-NEXT: v_readlane_b32 s15, v41, 63 -; SI-NEXT: v_readlane_b32 s13, v43, 5 -; SI-NEXT: v_readlane_b32 s11, v43, 11 -; SI-NEXT: v_readlane_b32 s9, v43, 17 +; SI-NEXT: v_readlane_b32 s23, v41, 42 +; SI-NEXT: v_readlane_b32 s21, v41, 48 +; SI-NEXT: v_readlane_b32 s19, v42, 26 +; SI-NEXT: v_readlane_b32 s17, v42, 32 +; SI-NEXT: v_readlane_b32 s15, v42, 38 +; SI-NEXT: v_readlane_b32 s13, v42, 44 +; SI-NEXT: v_readlane_b32 s11, v42, 50 +; SI-NEXT: v_readlane_b32 s9, v42, 56 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v40, 35 ; SI-NEXT: v_readlane_b32 s98, v40, 34 @@ -215573,228 +215081,245 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 20 -; SI-NEXT: v_writelane_b32 v41, s5, 21 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 22 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s5, 23 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 25 +; SI-NEXT: v_writelane_b32 v41, s5, 26 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 27 +; SI-NEXT: v_writelane_b32 v41, s5, 28 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 24 -; SI-NEXT: v_writelane_b32 v41, s5, 25 +; SI-NEXT: v_writelane_b32 v41, s4, 29 +; SI-NEXT: v_writelane_b32 v41, s5, 30 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 26 -; SI-NEXT: v_writelane_b32 v41, s5, 27 +; SI-NEXT: v_writelane_b32 v41, s4, 31 +; SI-NEXT: v_writelane_b32 v41, s5, 32 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 28 +; SI-NEXT: v_writelane_b32 v41, s4, 33 +; SI-NEXT: v_writelane_b32 v41, s5, 34 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 35 +; SI-NEXT: v_writelane_b32 v41, s5, 36 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 37 +; SI-NEXT: v_writelane_b32 v41, s5, 38 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 39 +; SI-NEXT: v_writelane_b32 v41, s5, 40 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 41 +; SI-NEXT: v_writelane_b32 v41, s5, 42 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 43 +; SI-NEXT: v_writelane_b32 v41, s5, 44 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 45 +; SI-NEXT: v_writelane_b32 v41, s5, 46 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 47 +; SI-NEXT: v_writelane_b32 v41, s5, 48 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 49 +; SI-NEXT: v_writelane_b32 v41, s5, 50 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 51 +; SI-NEXT: v_writelane_b32 v41, s5, 52 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 53 +; SI-NEXT: v_writelane_b32 v41, s5, 54 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 55 +; SI-NEXT: v_writelane_b32 v41, s5, 56 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 57 +; SI-NEXT: v_writelane_b32 v41, s5, 58 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 59 +; SI-NEXT: v_writelane_b32 v41, s5, 60 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 61 +; SI-NEXT: v_writelane_b32 v41, s5, 62 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v42, s5, 0 +; SI-NEXT: v_writelane_b32 v41, s4, 63 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 1 +; SI-NEXT: v_writelane_b32 v42, s5, 2 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 3 +; SI-NEXT: v_writelane_b32 v42, s5, 4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 5 +; SI-NEXT: v_writelane_b32 v42, s5, 6 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 7 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s5, 8 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 9 +; SI-NEXT: v_writelane_b32 v42, s5, 10 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 11 +; SI-NEXT: v_writelane_b32 v42, s5, 12 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 13 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s5, 14 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 15 +; SI-NEXT: v_writelane_b32 v42, s5, 16 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 17 +; SI-NEXT: v_writelane_b32 v42, s5, 18 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 19 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s5, 20 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 21 +; SI-NEXT: v_writelane_b32 v42, s5, 22 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 23 +; SI-NEXT: v_writelane_b32 v42, s5, 24 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 25 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s5, 26 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 27 +; SI-NEXT: v_writelane_b32 v42, s5, 28 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 29 +; SI-NEXT: v_writelane_b32 v42, s5, 30 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 31 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s5, 29 +; SI-NEXT: v_writelane_b32 v42, s5, 32 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 30 -; SI-NEXT: v_writelane_b32 v41, s5, 31 +; SI-NEXT: v_writelane_b32 v42, s4, 33 +; SI-NEXT: v_writelane_b32 v42, s5, 34 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 32 -; SI-NEXT: v_writelane_b32 v41, s5, 33 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 34 -; SI-NEXT: v_writelane_b32 v41, s5, 35 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 36 -; SI-NEXT: v_writelane_b32 v41, s5, 37 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 38 -; SI-NEXT: v_writelane_b32 v41, s5, 39 +; SI-NEXT: v_writelane_b32 v42, s4, 35 +; SI-NEXT: v_writelane_b32 v42, s5, 36 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 40 -; SI-NEXT: v_writelane_b32 v41, s5, 41 +; SI-NEXT: v_writelane_b32 v42, s4, 37 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 42 -; SI-NEXT: v_writelane_b32 v41, s5, 43 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 44 -; SI-NEXT: v_writelane_b32 v41, s5, 45 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 46 -; SI-NEXT: v_writelane_b32 v41, s5, 47 +; SI-NEXT: v_writelane_b32 v42, s5, 38 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 48 -; SI-NEXT: v_writelane_b32 v41, s5, 49 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 50 -; SI-NEXT: v_writelane_b32 v41, s5, 51 +; SI-NEXT: v_writelane_b32 v42, s4, 39 +; SI-NEXT: v_writelane_b32 v42, s5, 40 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 52 -; SI-NEXT: v_writelane_b32 v41, s5, 53 +; SI-NEXT: v_writelane_b32 v42, s4, 41 +; SI-NEXT: v_writelane_b32 v42, s5, 42 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 54 -; SI-NEXT: v_writelane_b32 v41, s5, 55 +; SI-NEXT: v_writelane_b32 v42, s4, 43 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 56 -; SI-NEXT: v_writelane_b32 v41, s5, 57 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 58 -; SI-NEXT: v_writelane_b32 v41, s5, 59 +; SI-NEXT: v_writelane_b32 v42, s5, 44 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 60 -; SI-NEXT: v_writelane_b32 v41, s5, 61 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s4, 62 +; SI-NEXT: v_writelane_b32 v42, s4, 45 +; SI-NEXT: v_writelane_b32 v42, s5, 46 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v43, s4, 0 -; SI-NEXT: v_writelane_b32 v43, s5, 1 +; SI-NEXT: v_writelane_b32 v42, s4, 47 +; SI-NEXT: v_writelane_b32 v42, s5, 48 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v43, s4, 2 -; SI-NEXT: v_writelane_b32 v43, s5, 3 +; SI-NEXT: v_writelane_b32 v42, s4, 49 +; SI-NEXT: v_writelane_b32 v42, s5, 50 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v43, s4, 4 -; SI-NEXT: v_writelane_b32 v43, s5, 5 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v43, s4, 6 -; SI-NEXT: v_writelane_b32 v43, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: v_writelane_b32 v42, s18, 51 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v43, s4, 8 -; SI-NEXT: v_writelane_b32 v43, s5, 9 +; SI-NEXT: v_writelane_b32 v42, s19, 52 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v43, s4, 10 -; SI-NEXT: v_writelane_b32 v43, s5, 11 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: v_writelane_b32 v43, s16, 12 -; SI-NEXT: v_writelane_b32 v43, s17, 13 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: v_writelane_b32 v43, s16, 14 -; SI-NEXT: v_writelane_b32 v43, s17, 15 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: v_writelane_b32 v43, s16, 16 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: v_writelane_b32 v42, s18, 53 +; SI-NEXT: v_writelane_b32 v42, s19, 54 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: s_mov_b32 s94, s90 +; SI-NEXT: s_mov_b32 s30, s79 +; SI-NEXT: s_mov_b32 s31, s7 +; SI-NEXT: s_mov_b32 s79, s78 +; SI-NEXT: s_mov_b32 s90, s76 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: v_writelane_b32 v42, s18, 55 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr81 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: v_writelane_b32 v41, s5, 63 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v43, s17, 17 +; SI-NEXT: v_writelane_b32 v42, s19, 56 +; SI-NEXT: ; kill: killed $sgpr5 +; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; kill: killed $sgpr5 ; SI-NEXT: s_branch .LBB99_2 ; ; VI-LABEL: bitcast_v64i16_to_v128i8_scalar: @@ -215888,159 +215413,161 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_cbranch_scc0 .LBB99_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s46, s19, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 0 -; VI-NEXT: s_lshr_b32 s46, s19, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 1 -; VI-NEXT: s_lshr_b32 s46, s19, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 2 -; VI-NEXT: s_lshr_b32 s46, s18, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 3 -; VI-NEXT: s_lshr_b32 s46, s18, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 4 -; VI-NEXT: s_lshr_b32 s46, s21, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 5 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 6 -; VI-NEXT: s_lshr_b32 s46, s21, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 7 -; VI-NEXT: s_lshr_b32 s46, s20, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s20, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s23, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s23, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s23, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s22, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s22, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s25, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s25, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s24, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s24, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s27, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s27, 16 +; VI-NEXT: s_lshr_b32 s46, s21, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: s_lshr_b32 s46, s20, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s26, 16 +; VI-NEXT: s_lshr_b32 s46, s20, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s26, 8 +; VI-NEXT: s_lshr_b32 s46, s23, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: s_lshr_b32 s46, s23, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: s_lshr_b32 s46, s23, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s5, 16 +; VI-NEXT: s_lshr_b32 s46, s22, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: s_lshr_b32 s46, s22, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s4, 16 +; VI-NEXT: s_lshr_b32 s46, s25, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: s_lshr_b32 s46, s25, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: s_lshr_b32 s46, s25, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s7, 16 +; VI-NEXT: s_lshr_b32 s46, s24, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: s_lshr_b32 s46, s24, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s6, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s9, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s8, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s11, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s10, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s13, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s13, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s12, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s12, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s15, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s15, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s15, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s14, 16 -; VI-NEXT: v_writelane_b32 v21, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s14, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s17, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s17, 16 -; VI-NEXT: s_lshr_b32 s80, s29, 16 -; VI-NEXT: s_lshr_b32 s82, s29, 8 -; VI-NEXT: s_lshr_b32 s84, s28, 16 -; VI-NEXT: s_lshr_b32 s86, s28, 8 -; VI-NEXT: s_lshr_b32 s51, s43, 24 -; VI-NEXT: s_lshr_b32 s53, s43, 16 -; VI-NEXT: s_lshr_b32 s54, s43, 8 -; VI-NEXT: s_lshr_b32 s65, s42, 16 -; VI-NEXT: s_lshr_b32 s66, s42, 8 -; VI-NEXT: s_lshr_b32 s67, s45, 24 -; VI-NEXT: s_lshr_b32 s68, s45, 16 -; VI-NEXT: s_lshr_b32 s69, s45, 8 -; VI-NEXT: s_lshr_b32 s70, s44, 16 -; VI-NEXT: s_lshr_b32 s71, s44, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 57 -; VI-NEXT: s_lshr_b32 s81, s17, 8 -; VI-NEXT: s_lshr_b32 s83, s16, 16 -; VI-NEXT: s_lshr_b32 s85, s16, 8 -; VI-NEXT: s_lshr_b32 s87, s41, 24 -; VI-NEXT: s_lshr_b32 s50, s41, 16 -; VI-NEXT: s_lshr_b32 s52, s41, 8 -; VI-NEXT: s_lshr_b32 s55, s40, 16 -; VI-NEXT: s_lshr_b32 s64, s40, 8 -; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[22:23], 24 +; VI-NEXT: s_lshr_b32 s47, s4, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 34 +; VI-NEXT: s_lshr_b32 s47, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 35 +; VI-NEXT: s_lshr_b32 s47, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s47, 36 +; VI-NEXT: s_lshr_b32 s47, s7, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 37 +; VI-NEXT: s_lshr_b32 s47, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 38 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 39 +; VI-NEXT: s_lshr_b32 s47, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 40 +; VI-NEXT: s_lshr_b32 s47, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s47, 41 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 42 +; VI-NEXT: s_lshr_b32 s47, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 43 +; VI-NEXT: s_lshr_b32 s47, s8, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 44 +; VI-NEXT: s_lshr_b32 s47, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 45 +; VI-NEXT: s_lshr_b32 s47, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s47, 46 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 47 +; VI-NEXT: s_lshr_b32 s47, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 48 +; VI-NEXT: s_lshr_b32 s47, s10, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 49 +; VI-NEXT: s_lshr_b32 s47, s10, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 50 +; VI-NEXT: s_lshr_b32 s47, s13, 24 +; VI-NEXT: v_writelane_b32 v21, s47, 51 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 52 +; VI-NEXT: s_lshr_b64 s[30:31], s[18:19], 24 +; VI-NEXT: v_writelane_b32 v21, s30, 4 +; VI-NEXT: v_writelane_b32 v21, s31, 5 +; VI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 +; VI-NEXT: v_writelane_b32 v21, s30, 2 +; VI-NEXT: v_writelane_b32 v21, s31, 3 +; VI-NEXT: s_lshr_b64 s[30:31], s[22:23], 24 +; VI-NEXT: s_lshr_b32 s48, s18, 8 +; VI-NEXT: s_lshr_b32 s80, s21, 24 +; VI-NEXT: s_lshr_b32 s90, s5, 8 +; VI-NEXT: v_writelane_b32 v21, s30, 0 +; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 +; VI-NEXT: s_lshr_b32 s89, s19, 16 +; VI-NEXT: s_lshr_b32 s88, s21, 16 +; VI-NEXT: v_writelane_b32 v21, s31, 1 ; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; VI-NEXT: s_mov_b32 s39, s48 +; VI-NEXT: s_mov_b32 s48, s80 +; VI-NEXT: s_mov_b32 s80, s90 +; VI-NEXT: s_lshr_b64 s[90:91], s[44:45], 24 +; VI-NEXT: s_mov_b32 s31, s89 +; VI-NEXT: s_mov_b32 s91, s88 +; VI-NEXT: s_lshr_b64 s[88:89], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v21, s88, 18 +; VI-NEXT: v_writelane_b32 v21, s89, 19 +; VI-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 +; VI-NEXT: v_writelane_b32 v21, s88, 16 +; VI-NEXT: v_writelane_b32 v21, s89, 17 +; VI-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 +; VI-NEXT: v_writelane_b32 v21, s88, 14 +; VI-NEXT: v_writelane_b32 v21, s89, 15 +; VI-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 +; VI-NEXT: v_writelane_b32 v21, s88, 12 +; VI-NEXT: v_writelane_b32 v21, s89, 13 +; VI-NEXT: s_lshr_b64 s[88:89], s[12:13], 24 +; VI-NEXT: v_writelane_b32 v21, s88, 10 +; VI-NEXT: v_writelane_b32 v21, s89, 11 +; VI-NEXT: s_lshr_b64 s[88:89], s[14:15], 24 +; VI-NEXT: v_writelane_b32 v21, s88, 8 +; VI-NEXT: v_writelane_b32 v21, s89, 9 +; VI-NEXT: s_lshr_b64 s[88:89], s[16:17], 24 +; VI-NEXT: s_lshr_b32 vcc_lo, s19, 8 +; VI-NEXT: s_lshr_b32 vcc_hi, s18, 16 ; VI-NEXT: s_lshr_b64 s[34:35], s[26:27], 24 ; VI-NEXT: s_lshr_b64 s[36:37], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 -; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[40:41], 24 +; VI-NEXT: v_writelane_b32 v21, s88, 6 +; VI-NEXT: s_lshr_b32 s71, s27, 24 +; VI-NEXT: s_lshr_b32 s83, s27, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 8 +; VI-NEXT: s_lshr_b32 s86, s26, 16 +; VI-NEXT: s_lshr_b32 s63, s26, 8 +; VI-NEXT: s_lshr_b32 s72, s29, 24 +; VI-NEXT: s_lshr_b32 s73, s29, 16 +; VI-NEXT: s_lshr_b32 s51, s29, 8 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: s_lshr_b32 s74, s28, 8 +; VI-NEXT: s_lshr_b32 s54, s43, 24 +; VI-NEXT: s_lshr_b32 s64, s43, 16 +; VI-NEXT: s_lshr_b32 s65, s43, 8 +; VI-NEXT: s_lshr_b32 s77, s42, 16 +; VI-NEXT: s_lshr_b32 s79, s42, 8 +; VI-NEXT: s_lshr_b32 s56, s45, 24 +; VI-NEXT: s_lshr_b32 s68, s45, 16 +; VI-NEXT: s_lshr_b32 s69, s45, 8 +; VI-NEXT: s_lshr_b32 s70, s44, 16 +; VI-NEXT: s_lshr_b32 s58, s44, 8 +; VI-NEXT: s_lshr_b32 s49, s5, 24 +; VI-NEXT: s_lshr_b32 s81, s5, 16 +; VI-NEXT: s_lshr_b32 s59, s13, 8 +; VI-NEXT: s_lshr_b32 s82, s12, 16 +; VI-NEXT: s_lshr_b32 s84, s12, 8 +; VI-NEXT: s_lshr_b32 s61, s15, 24 +; VI-NEXT: s_lshr_b32 s85, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s15, 8 +; VI-NEXT: s_lshr_b32 s87, s14, 16 +; VI-NEXT: s_lshr_b32 s50, s14, 8 +; VI-NEXT: s_lshr_b32 s52, s17, 24 +; VI-NEXT: s_lshr_b32 s53, s17, 16 +; VI-NEXT: s_lshr_b32 s47, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s55, s16, 8 +; VI-NEXT: s_lshr_b32 s66, s41, 24 +; VI-NEXT: s_lshr_b32 s76, s41, 16 +; VI-NEXT: s_lshr_b32 s78, s41, 8 +; VI-NEXT: s_lshr_b32 s67, s40, 16 +; VI-NEXT: s_lshr_b32 s57, s40, 8 +; VI-NEXT: s_mov_b32 s35, vcc_lo +; VI-NEXT: s_mov_b32 s37, vcc_hi +; VI-NEXT: v_writelane_b32 v21, s89, 7 +; VI-NEXT: s_lshr_b64 s[88:89], s[40:41], 24 ; VI-NEXT: s_cbranch_execnz .LBB99_3 ; VI-NEXT: .LBB99_2: ; %cmp.true ; VI-NEXT: s_and_b32 s46, s41, 0xffff0000 @@ -216169,299 +215696,288 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s19, s46, s19 ; VI-NEXT: s_and_b32 s46, s18, 0xffff0000 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_add_i32 s19, s19, 0x30000 ; VI-NEXT: s_and_b32 s18, s18, 0xffff ; VI-NEXT: s_or_b32 s18, s46, s18 -; VI-NEXT: s_lshr_b32 s46, s19, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 0 -; VI-NEXT: s_lshr_b32 s46, s19, 16 +; VI-NEXT: s_add_i32 s19, s19, 0x30000 ; VI-NEXT: s_add_i32 s18, s18, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 1 -; VI-NEXT: s_lshr_b32 s46, s19, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 2 -; VI-NEXT: s_lshr_b32 s46, s18, 16 +; VI-NEXT: s_lshr_b64 s[46:47], s[18:19], 24 ; VI-NEXT: s_add_i32 s21, s21, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 3 -; VI-NEXT: s_lshr_b32 s46, s18, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 4 -; VI-NEXT: s_lshr_b32 s46, s21, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 5 -; VI-NEXT: s_lshr_b32 s46, s21, 16 ; VI-NEXT: s_add_i32 s20, s20, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 6 -; VI-NEXT: s_lshr_b32 s46, s21, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 7 -; VI-NEXT: s_lshr_b32 s46, s20, 16 +; VI-NEXT: v_writelane_b32 v21, s46, 4 +; VI-NEXT: v_writelane_b32 v21, s47, 5 +; VI-NEXT: s_lshr_b64 s[46:47], s[20:21], 24 ; VI-NEXT: s_add_i32 s23, s23, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s20, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s23, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s23, 16 ; VI-NEXT: s_add_i32 s22, s22, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s23, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s22, 16 -; VI-NEXT: s_add_i32 s25, s25, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s22, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s25, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: s_add_i32 s24, s24, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s25, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s24, 16 -; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 2 +; VI-NEXT: v_writelane_b32 v21, s47, 3 +; VI-NEXT: s_lshr_b64 s[46:47], s[22:23], 24 +; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 0 +; VI-NEXT: v_writelane_b32 v21, s47, 1 +; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_add_i32 s6, s6, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s24, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s27, 24 +; VI-NEXT: v_writelane_b32 v21, s47, 19 +; VI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; VI-NEXT: s_add_i32 s9, s9, 0x30000 +; VI-NEXT: s_add_i32 s8, s8, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 17 +; VI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 +; VI-NEXT: s_add_i32 s11, s11, 0x30000 +; VI-NEXT: s_add_i32 s10, s10, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 14 +; VI-NEXT: v_writelane_b32 v21, s47, 15 +; VI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 +; VI-NEXT: s_add_i32 s13, s13, 0x30000 +; VI-NEXT: s_add_i32 s12, s12, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 12 +; VI-NEXT: v_writelane_b32 v21, s47, 13 +; VI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; VI-NEXT: s_add_i32 s15, s15, 0x30000 +; VI-NEXT: s_add_i32 s14, s14, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 10 +; VI-NEXT: v_writelane_b32 v21, s47, 11 +; VI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 +; VI-NEXT: s_add_i32 s17, s17, 0x30000 +; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s46, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 9 +; VI-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 +; VI-NEXT: v_writelane_b32 v21, s46, 6 +; VI-NEXT: v_writelane_b32 v21, s47, 7 +; VI-NEXT: s_lshr_b32 s46, s19, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s21, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s27, 8 +; VI-NEXT: s_lshr_b32 s46, s20, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s26, 16 -; VI-NEXT: s_add_i32 s29, s29, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s20, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s26, 8 -; VI-NEXT: s_add_i32 s5, s5, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s23, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s29, 24 +; VI-NEXT: s_lshr_b32 s46, s23, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s5, 24 +; VI-NEXT: s_lshr_b32 s46, s23, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s5, 16 -; VI-NEXT: s_add_i32 s4, s4, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s22, 16 +; VI-NEXT: s_add_i32 s25, s25, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s5, 8 +; VI-NEXT: s_lshr_b32 s46, s22, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s4, 16 -; VI-NEXT: s_add_i32 s7, s7, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s25, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s4, 8 +; VI-NEXT: s_lshr_b32 s46, s25, 16 +; VI-NEXT: s_add_i32 s24, s24, 0x30000 ; VI-NEXT: v_writelane_b32 v21, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s7, 24 +; VI-NEXT: s_lshr_b32 s46, s25, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: s_lshr_b32 s46, s24, 16 ; VI-NEXT: v_writelane_b32 v21, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s7, 8 +; VI-NEXT: s_lshr_b32 s46, s24, 8 ; VI-NEXT: v_writelane_b32 v21, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s6, 16 -; VI-NEXT: s_add_i32 s9, s9, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s9, 16 -; VI-NEXT: s_add_i32 s8, s8, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s8, 16 -; VI-NEXT: s_add_i32 s11, s11, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s11, 16 -; VI-NEXT: s_add_i32 s10, s10, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s10, 16 -; VI-NEXT: s_add_i32 s13, s13, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s13, 16 -; VI-NEXT: s_add_i32 s12, s12, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s13, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s12, 16 -; VI-NEXT: s_add_i32 s15, s15, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s12, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s15, 24 -; VI-NEXT: v_writelane_b32 v21, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s15, 16 -; VI-NEXT: s_add_i32 s14, s14, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s15, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s14, 16 -; VI-NEXT: s_add_i32 s17, s17, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s14, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s17, 24 -; VI-NEXT: s_add_i32 s41, s41, 0x30000 -; VI-NEXT: s_add_i32 s40, s40, 0x30000 -; VI-NEXT: s_add_i32 s16, s16, 0x30000 +; VI-NEXT: s_lshr_b32 s47, s4, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 34 +; VI-NEXT: s_lshr_b32 s47, s4, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 35 +; VI-NEXT: s_lshr_b32 s47, s7, 24 +; VI-NEXT: v_writelane_b32 v21, s47, 36 +; VI-NEXT: s_lshr_b32 s47, s7, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 37 +; VI-NEXT: s_lshr_b32 s47, s7, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 38 +; VI-NEXT: s_lshr_b32 s47, s6, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 39 +; VI-NEXT: s_lshr_b32 s47, s6, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 40 +; VI-NEXT: s_lshr_b32 s47, s9, 24 +; VI-NEXT: v_writelane_b32 v21, s47, 41 +; VI-NEXT: s_lshr_b32 s47, s9, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 42 +; VI-NEXT: s_lshr_b32 s47, s9, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 43 +; VI-NEXT: s_lshr_b32 s47, s8, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 44 +; VI-NEXT: s_lshr_b32 s47, s8, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 45 +; VI-NEXT: s_lshr_b32 s47, s11, 24 +; VI-NEXT: v_writelane_b32 v21, s47, 46 +; VI-NEXT: s_lshr_b32 s47, s11, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 47 +; VI-NEXT: s_lshr_b32 s47, s11, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 48 +; VI-NEXT: s_lshr_b32 s47, s10, 16 +; VI-NEXT: v_writelane_b32 v21, s47, 49 +; VI-NEXT: s_lshr_b32 s47, s10, 8 ; VI-NEXT: s_add_i32 s45, s45, 0x30000 ; VI-NEXT: s_add_i32 s44, s44, 0x30000 ; VI-NEXT: s_add_i32 s43, s43, 0x30000 ; VI-NEXT: s_add_i32 s42, s42, 0x30000 +; VI-NEXT: s_add_i32 s29, s29, 0x30000 ; VI-NEXT: s_add_i32 s28, s28, 0x30000 -; VI-NEXT: v_writelane_b32 v21, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s17, 16 -; VI-NEXT: s_lshr_b32 s80, s29, 16 -; VI-NEXT: s_lshr_b32 s82, s29, 8 -; VI-NEXT: s_lshr_b32 s84, s28, 16 -; VI-NEXT: s_lshr_b32 s86, s28, 8 -; VI-NEXT: s_lshr_b32 s51, s43, 24 -; VI-NEXT: s_lshr_b32 s53, s43, 16 -; VI-NEXT: s_lshr_b32 s54, s43, 8 -; VI-NEXT: s_lshr_b32 s65, s42, 16 -; VI-NEXT: s_lshr_b32 s66, s42, 8 -; VI-NEXT: s_lshr_b32 s67, s45, 24 -; VI-NEXT: s_lshr_b32 s68, s45, 16 -; VI-NEXT: s_lshr_b32 s69, s45, 8 -; VI-NEXT: s_lshr_b32 s70, s44, 16 -; VI-NEXT: s_lshr_b32 s71, s44, 8 -; VI-NEXT: v_writelane_b32 v21, s46, 57 -; VI-NEXT: s_lshr_b32 s81, s17, 8 -; VI-NEXT: s_lshr_b32 s83, s16, 16 -; VI-NEXT: s_lshr_b32 s85, s16, 8 -; VI-NEXT: s_lshr_b32 s87, s41, 24 -; VI-NEXT: s_lshr_b32 s50, s41, 16 -; VI-NEXT: s_lshr_b32 s52, s41, 8 -; VI-NEXT: s_lshr_b32 s55, s40, 16 -; VI-NEXT: s_lshr_b32 s64, s40, 8 -; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[22:23], 24 +; VI-NEXT: s_add_i32 s27, s27, 0x30000 +; VI-NEXT: s_add_i32 s26, s26, 0x30000 +; VI-NEXT: v_writelane_b32 v21, s47, 50 +; VI-NEXT: s_lshr_b32 s47, s13, 24 +; VI-NEXT: s_add_i32 s41, s41, 0x30000 +; VI-NEXT: s_add_i32 s40, s40, 0x30000 ; VI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 ; VI-NEXT: s_lshr_b64 s[34:35], s[26:27], 24 ; VI-NEXT: s_lshr_b64 s[36:37], s[28:29], 24 ; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 -; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[56:57], s[6:7], 24 -; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[40:41], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[44:45], 24 +; VI-NEXT: v_writelane_b32 v21, s47, 51 +; VI-NEXT: s_lshr_b32 s47, s13, 16 +; VI-NEXT: s_lshr_b64 s[88:89], s[40:41], 24 +; VI-NEXT: s_lshr_b32 s31, s19, 16 +; VI-NEXT: s_lshr_b32 s35, s19, 8 +; VI-NEXT: s_lshr_b32 s37, s18, 16 +; VI-NEXT: s_lshr_b32 s39, s18, 8 +; VI-NEXT: s_lshr_b32 s48, s21, 24 +; VI-NEXT: s_lshr_b32 s91, s21, 16 +; VI-NEXT: s_lshr_b32 s71, s27, 24 +; VI-NEXT: s_lshr_b32 s83, s27, 16 +; VI-NEXT: s_lshr_b32 s60, s27, 8 +; VI-NEXT: s_lshr_b32 s86, s26, 16 +; VI-NEXT: s_lshr_b32 s63, s26, 8 +; VI-NEXT: s_lshr_b32 s72, s29, 24 +; VI-NEXT: s_lshr_b32 s73, s29, 16 +; VI-NEXT: s_lshr_b32 s51, s29, 8 +; VI-NEXT: s_lshr_b32 s46, s28, 16 +; VI-NEXT: s_lshr_b32 s74, s28, 8 +; VI-NEXT: s_lshr_b32 s54, s43, 24 +; VI-NEXT: s_lshr_b32 s64, s43, 16 +; VI-NEXT: s_lshr_b32 s65, s43, 8 +; VI-NEXT: s_lshr_b32 s77, s42, 16 +; VI-NEXT: s_lshr_b32 s79, s42, 8 +; VI-NEXT: s_lshr_b32 s56, s45, 24 +; VI-NEXT: s_lshr_b32 s68, s45, 16 +; VI-NEXT: s_lshr_b32 s69, s45, 8 +; VI-NEXT: s_lshr_b32 s70, s44, 16 +; VI-NEXT: s_lshr_b32 s58, s44, 8 +; VI-NEXT: s_lshr_b32 s49, s5, 24 +; VI-NEXT: s_lshr_b32 s81, s5, 16 +; VI-NEXT: s_lshr_b32 s80, s5, 8 +; VI-NEXT: v_writelane_b32 v21, s47, 52 +; VI-NEXT: s_lshr_b32 s59, s13, 8 +; VI-NEXT: s_lshr_b32 s82, s12, 16 +; VI-NEXT: s_lshr_b32 s84, s12, 8 +; VI-NEXT: s_lshr_b32 s61, s15, 24 +; VI-NEXT: s_lshr_b32 s85, s15, 16 +; VI-NEXT: s_lshr_b32 s62, s15, 8 +; VI-NEXT: s_lshr_b32 s87, s14, 16 +; VI-NEXT: s_lshr_b32 s50, s14, 8 +; VI-NEXT: s_lshr_b32 s52, s17, 24 +; VI-NEXT: s_lshr_b32 s53, s17, 16 +; VI-NEXT: s_lshr_b32 s47, s17, 8 +; VI-NEXT: s_lshr_b32 s75, s16, 16 +; VI-NEXT: s_lshr_b32 s55, s16, 8 +; VI-NEXT: s_lshr_b32 s66, s41, 24 +; VI-NEXT: s_lshr_b32 s76, s41, 16 +; VI-NEXT: s_lshr_b32 s78, s41, 8 +; VI-NEXT: s_lshr_b32 s67, s40, 16 +; VI-NEXT: s_lshr_b32 s57, s40, 8 ; VI-NEXT: .LBB99_3: ; %end -; VI-NEXT: s_lshl_b32 s47, s71, 8 ; VI-NEXT: s_and_b32 s44, s44, 0xff -; VI-NEXT: s_or_b32 s44, s44, s47 -; VI-NEXT: s_lshl_b32 s47, s48, 8 -; VI-NEXT: s_and_b32 s57, s70, 0xff -; VI-NEXT: s_or_b32 s47, s57, s47 +; VI-NEXT: s_lshl_b32 s58, s58, 8 +; VI-NEXT: s_or_b32 s44, s44, s58 +; VI-NEXT: s_and_b32 s58, s70, 0xff +; VI-NEXT: s_lshl_b32 s89, s90, 8 +; VI-NEXT: s_or_b32 s58, s58, s89 ; VI-NEXT: s_and_b32 s44, s44, 0xffff -; VI-NEXT: s_lshl_b32 s47, s47, 16 -; VI-NEXT: s_or_b32 s44, s44, s47 +; VI-NEXT: s_lshl_b32 s58, s58, 16 +; VI-NEXT: s_or_b32 s44, s44, s58 ; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: s_and_b32 s44, s45, 0xff ; VI-NEXT: s_lshl_b32 s45, s69, 8 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, s68, 0xff -; VI-NEXT: s_lshl_b32 s47, s67, 8 -; VI-NEXT: s_or_b32 s45, s45, s47 +; VI-NEXT: s_lshl_b32 s56, s56, 8 +; VI-NEXT: s_or_b32 s45, s45, s56 ; VI-NEXT: s_and_b32 s44, s44, 0xffff ; VI-NEXT: s_lshl_b32 s45, s45, 16 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: v_mov_b32_e32 v2, s44 -; VI-NEXT: s_lshl_b32 s44, s66, 8 ; VI-NEXT: s_and_b32 s42, s42, 0xff +; VI-NEXT: s_lshl_b32 s44, s79, 8 ; VI-NEXT: s_or_b32 s42, s42, s44 -; VI-NEXT: s_lshl_b32 s44, s38, 8 -; VI-NEXT: s_and_b32 s45, s65, 0xff -; VI-NEXT: s_or_b32 s44, s45, s44 +; VI-NEXT: s_and_b32 s44, s77, 0xff +; VI-NEXT: s_lshl_b32 s45, s38, 8 +; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s42, s42, 0xffff ; VI-NEXT: s_lshl_b32 s44, s44, 16 ; VI-NEXT: s_or_b32 s42, s42, s44 ; VI-NEXT: v_mov_b32_e32 v3, s42 ; VI-NEXT: s_and_b32 s42, s43, 0xff -; VI-NEXT: s_lshl_b32 s43, s54, 8 +; VI-NEXT: s_lshl_b32 s43, s65, 8 ; VI-NEXT: s_or_b32 s42, s42, s43 -; VI-NEXT: s_and_b32 s43, s53, 0xff -; VI-NEXT: s_lshl_b32 s44, s51, 8 +; VI-NEXT: s_and_b32 s43, s64, 0xff +; VI-NEXT: s_lshl_b32 s44, s54, 8 ; VI-NEXT: s_or_b32 s43, s43, s44 ; VI-NEXT: s_and_b32 s42, s42, 0xffff ; VI-NEXT: s_lshl_b32 s43, s43, 16 ; VI-NEXT: s_or_b32 s42, s42, s43 ; VI-NEXT: v_mov_b32_e32 v4, s42 -; VI-NEXT: s_lshl_b32 s42, s86, 8 ; VI-NEXT: s_and_b32 s28, s28, 0xff +; VI-NEXT: s_lshl_b32 s42, s74, 8 ; VI-NEXT: s_or_b32 s28, s28, s42 -; VI-NEXT: s_lshl_b32 s42, s36, 8 -; VI-NEXT: s_and_b32 s43, s84, 0xff -; VI-NEXT: s_or_b32 s42, s43, s42 +; VI-NEXT: s_and_b32 s42, s46, 0xff +; VI-NEXT: s_lshl_b32 s43, s36, 8 +; VI-NEXT: s_or_b32 s42, s42, s43 ; VI-NEXT: s_and_b32 s28, s28, 0xffff ; VI-NEXT: s_lshl_b32 s42, s42, 16 ; VI-NEXT: s_or_b32 s28, s28, s42 ; VI-NEXT: v_mov_b32_e32 v5, s28 ; VI-NEXT: s_and_b32 s28, s29, 0xff -; VI-NEXT: s_lshl_b32 s29, s82, 8 -; VI-NEXT: v_readlane_b32 s42, v21, 25 +; VI-NEXT: s_lshl_b32 s29, s51, 8 ; VI-NEXT: s_or_b32 s28, s28, s29 -; VI-NEXT: s_and_b32 s29, s80, 0xff -; VI-NEXT: s_lshl_b32 s42, s42, 8 +; VI-NEXT: s_and_b32 s29, s73, 0xff +; VI-NEXT: s_lshl_b32 s42, s72, 8 ; VI-NEXT: s_or_b32 s29, s29, s42 ; VI-NEXT: s_and_b32 s28, s28, 0xffff ; VI-NEXT: s_lshl_b32 s29, s29, 16 ; VI-NEXT: s_or_b32 s28, s28, s29 ; VI-NEXT: v_mov_b32_e32 v6, s28 -; VI-NEXT: v_readlane_b32 s28, v21, 24 -; VI-NEXT: s_lshl_b32 s28, s28, 8 ; VI-NEXT: s_and_b32 s26, s26, 0xff -; VI-NEXT: v_readlane_b32 s29, v21, 23 +; VI-NEXT: s_lshl_b32 s28, s63, 8 ; VI-NEXT: s_or_b32 s26, s26, s28 -; VI-NEXT: s_lshl_b32 s28, s34, 8 -; VI-NEXT: s_and_b32 s29, s29, 0xff -; VI-NEXT: s_or_b32 s28, s29, s28 +; VI-NEXT: s_and_b32 s28, s86, 0xff +; VI-NEXT: s_lshl_b32 s29, s34, 8 +; VI-NEXT: s_or_b32 s28, s28, s29 ; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s28, s28, 16 ; VI-NEXT: s_or_b32 s26, s26, s28 ; VI-NEXT: v_mov_b32_e32 v7, s26 ; VI-NEXT: s_and_b32 s26, s27, 0xff -; VI-NEXT: v_readlane_b32 s27, v21, 22 -; VI-NEXT: s_lshl_b32 s27, s27, 8 +; VI-NEXT: s_lshl_b32 s27, s60, 8 ; VI-NEXT: s_or_b32 s26, s26, s27 -; VI-NEXT: v_readlane_b32 s27, v21, 21 -; VI-NEXT: v_readlane_b32 s28, v21, 20 -; VI-NEXT: s_and_b32 s27, s27, 0xff -; VI-NEXT: s_lshl_b32 s28, s28, 8 +; VI-NEXT: s_and_b32 s27, s83, 0xff +; VI-NEXT: s_lshl_b32 s28, s71, 8 ; VI-NEXT: s_or_b32 s27, s27, s28 ; VI-NEXT: s_and_b32 s26, s26, 0xffff ; VI-NEXT: s_lshl_b32 s27, s27, 16 ; VI-NEXT: s_or_b32 s26, s26, s27 ; VI-NEXT: v_mov_b32_e32 v8, s26 -; VI-NEXT: v_readlane_b32 s26, v21, 19 -; VI-NEXT: s_lshl_b32 s26, s26, 8 +; VI-NEXT: v_readlane_b32 s26, v21, 33 ; VI-NEXT: s_and_b32 s24, s24, 0xff -; VI-NEXT: v_readlane_b32 s27, v21, 18 +; VI-NEXT: s_lshl_b32 s26, s26, 8 ; VI-NEXT: s_or_b32 s24, s24, s26 -; VI-NEXT: s_lshl_b32 s26, s30, 8 -; VI-NEXT: s_and_b32 s27, s27, 0xff -; VI-NEXT: s_or_b32 s26, s27, s26 +; VI-NEXT: v_readlane_b32 s26, v21, 32 +; VI-NEXT: s_and_b32 s26, s26, 0xff +; VI-NEXT: s_lshl_b32 s27, s30, 8 +; VI-NEXT: s_or_b32 s26, s26, s27 ; VI-NEXT: s_and_b32 s24, s24, 0xffff ; VI-NEXT: s_lshl_b32 s26, s26, 16 ; VI-NEXT: s_or_b32 s24, s24, s26 ; VI-NEXT: v_mov_b32_e32 v9, s24 ; VI-NEXT: s_and_b32 s24, s25, 0xff -; VI-NEXT: v_readlane_b32 s25, v21, 17 +; VI-NEXT: v_readlane_b32 s25, v21, 31 ; VI-NEXT: s_lshl_b32 s25, s25, 8 ; VI-NEXT: s_or_b32 s24, s24, s25 -; VI-NEXT: v_readlane_b32 s25, v21, 16 -; VI-NEXT: v_readlane_b32 s26, v21, 15 +; VI-NEXT: v_readlane_b32 s25, v21, 30 +; VI-NEXT: v_readlane_b32 s26, v21, 29 ; VI-NEXT: s_and_b32 s25, s25, 0xff ; VI-NEXT: s_lshl_b32 s26, s26, 8 ; VI-NEXT: s_or_b32 s25, s25, s26 @@ -216469,24 +215985,25 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s25, s25, 16 ; VI-NEXT: s_or_b32 s24, s24, s25 ; VI-NEXT: v_mov_b32_e32 v10, s24 -; VI-NEXT: v_readlane_b32 s24, v21, 14 -; VI-NEXT: s_lshl_b32 s24, s24, 8 +; VI-NEXT: v_readlane_b32 s24, v21, 28 ; VI-NEXT: s_and_b32 s22, s22, 0xff -; VI-NEXT: v_readlane_b32 s25, v21, 13 +; VI-NEXT: s_lshl_b32 s24, s24, 8 ; VI-NEXT: s_or_b32 s22, s22, s24 -; VI-NEXT: s_lshl_b32 s24, s90, 8 -; VI-NEXT: s_and_b32 s25, s25, 0xff -; VI-NEXT: s_or_b32 s24, s25, s24 +; VI-NEXT: v_readlane_b32 s24, v21, 27 +; VI-NEXT: v_readlane_b32 s26, v21, 0 +; VI-NEXT: s_and_b32 s24, s24, 0xff +; VI-NEXT: s_lshl_b32 s25, s26, 8 +; VI-NEXT: s_or_b32 s24, s24, s25 ; VI-NEXT: s_and_b32 s22, s22, 0xffff ; VI-NEXT: s_lshl_b32 s24, s24, 16 ; VI-NEXT: s_or_b32 s22, s22, s24 ; VI-NEXT: v_mov_b32_e32 v11, s22 ; VI-NEXT: s_and_b32 s22, s23, 0xff -; VI-NEXT: v_readlane_b32 s23, v21, 12 +; VI-NEXT: v_readlane_b32 s23, v21, 26 ; VI-NEXT: s_lshl_b32 s23, s23, 8 ; VI-NEXT: s_or_b32 s22, s22, s23 -; VI-NEXT: v_readlane_b32 s23, v21, 11 -; VI-NEXT: v_readlane_b32 s24, v21, 10 +; VI-NEXT: v_readlane_b32 s23, v21, 25 +; VI-NEXT: v_readlane_b32 s24, v21, 24 ; VI-NEXT: s_and_b32 s23, s23, 0xff ; VI-NEXT: s_lshl_b32 s24, s24, 8 ; VI-NEXT: s_or_b32 s23, s23, s24 @@ -216494,102 +216011,98 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s23, s23, 16 ; VI-NEXT: s_or_b32 s22, s22, s23 ; VI-NEXT: v_mov_b32_e32 v12, s22 -; VI-NEXT: v_readlane_b32 s22, v21, 9 -; VI-NEXT: s_lshl_b32 s22, s22, 8 +; VI-NEXT: v_readlane_b32 s22, v21, 23 ; VI-NEXT: s_and_b32 s20, s20, 0xff -; VI-NEXT: v_readlane_b32 s23, v21, 8 +; VI-NEXT: s_lshl_b32 s22, s22, 8 ; VI-NEXT: s_or_b32 s20, s20, s22 -; VI-NEXT: s_lshl_b32 s22, s88, 8 -; VI-NEXT: s_and_b32 s23, s23, 0xff -; VI-NEXT: s_or_b32 s22, s23, s22 +; VI-NEXT: v_readlane_b32 s22, v21, 22 +; VI-NEXT: v_readlane_b32 s24, v21, 2 +; VI-NEXT: s_and_b32 s22, s22, 0xff +; VI-NEXT: s_lshl_b32 s23, s24, 8 +; VI-NEXT: s_or_b32 s22, s22, s23 ; VI-NEXT: s_and_b32 s20, s20, 0xffff ; VI-NEXT: s_lshl_b32 s22, s22, 16 -; VI-NEXT: s_or_b32 s20, s20, s22 -; VI-NEXT: v_mov_b32_e32 v13, s20 -; VI-NEXT: s_and_b32 s20, s21, 0xff -; VI-NEXT: v_readlane_b32 s21, v21, 7 -; VI-NEXT: s_lshl_b32 s21, s21, 8 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: v_readlane_b32 s21, v21, 6 -; VI-NEXT: v_readlane_b32 s22, v21, 5 -; VI-NEXT: s_and_b32 s21, s21, 0xff -; VI-NEXT: s_lshl_b32 s22, s22, 8 -; VI-NEXT: s_or_b32 s21, s21, s22 -; VI-NEXT: s_and_b32 s20, s20, 0xffff -; VI-NEXT: s_lshl_b32 s21, s21, 16 -; VI-NEXT: s_or_b32 s20, s20, s21 -; VI-NEXT: v_mov_b32_e32 v14, s20 -; VI-NEXT: v_readlane_b32 s20, v21, 4 -; VI-NEXT: s_lshl_b32 s20, s20, 8 -; VI-NEXT: s_and_b32 s18, s18, 0xff -; VI-NEXT: v_readlane_b32 s21, v21, 3 -; VI-NEXT: s_or_b32 s18, s18, s20 -; VI-NEXT: s_lshl_b32 s20, s76, 8 -; VI-NEXT: s_and_b32 s21, s21, 0xff -; VI-NEXT: s_or_b32 s20, s21, s20 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 -; VI-NEXT: s_and_b32 s18, s18, 0xffff -; VI-NEXT: s_lshl_b32 s20, s20, 16 +; VI-NEXT: s_or_b32 s20, s20, s22 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 -; VI-NEXT: s_or_b32 s18, s18, s20 +; VI-NEXT: v_mov_b32_e32 v13, s20 +; VI-NEXT: s_and_b32 s20, s21, 0xff +; VI-NEXT: v_readlane_b32 s21, v21, 21 ; VI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 -; VI-NEXT: v_mov_b32_e32 v15, s18 -; VI-NEXT: s_and_b32 s18, s19, 0xff -; VI-NEXT: v_readlane_b32 s19, v21, 2 +; VI-NEXT: s_lshl_b32 s21, s21, 8 ; VI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; VI-NEXT: s_lshl_b32 s19, s19, 8 +; VI-NEXT: s_or_b32 s20, s20, s21 ; VI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s21, s91, 0xff +; VI-NEXT: s_lshl_b32 s22, s48, 8 ; VI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; VI-NEXT: v_readlane_b32 s19, v21, 1 -; VI-NEXT: v_readlane_b32 s20, v21, 0 +; VI-NEXT: s_or_b32 s21, s21, s22 ; VI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; VI-NEXT: s_and_b32 s19, s19, 0xff -; VI-NEXT: s_lshl_b32 s20, s20, 8 +; VI-NEXT: s_and_b32 s20, s20, 0xffff +; VI-NEXT: s_lshl_b32 s21, s21, 16 ; VI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; VI-NEXT: s_or_b32 s19, s19, s20 +; VI-NEXT: s_or_b32 s20, s20, s21 ; VI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; VI-NEXT: s_and_b32 s18, s18, 0xffff -; VI-NEXT: s_lshl_b32 s19, s19, 16 +; VI-NEXT: v_mov_b32_e32 v2, s20 +; VI-NEXT: s_and_b32 s18, s18, 0xff +; VI-NEXT: s_lshl_b32 s20, s39, 8 +; VI-NEXT: v_readlane_b32 s22, v21, 4 ; VI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_or_b32 s18, s18, s20 +; VI-NEXT: s_and_b32 s20, s37, 0xff +; VI-NEXT: s_lshl_b32 s21, s22, 8 ; VI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: s_and_b32 s18, s40, 0xff -; VI-NEXT: s_lshl_b32 s19, s64, 8 +; VI-NEXT: s_or_b32 s20, s20, s21 ; VI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, s55, 0xff -; VI-NEXT: s_lshl_b32 s20, s78, 8 +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s20, s20, 16 ; VI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s18, s18, s20 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s18, s19, 0xff +; VI-NEXT: s_lshl_b32 s19, s35, 8 +; VI-NEXT: v_readlane_b32 s20, v21, 20 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, s31, 0xff +; VI-NEXT: s_lshl_b32 s20, s20, 8 ; VI-NEXT: s_or_b32 s19, s19, s20 -; VI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; VI-NEXT: s_and_b32 s18, s18, 0xffff +; VI-NEXT: s_lshl_b32 s19, s19, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: s_and_b32 s18, s40, 0xff +; VI-NEXT: s_lshl_b32 s19, s57, 8 +; VI-NEXT: s_or_b32 s18, s18, s19 +; VI-NEXT: s_and_b32 s19, s67, 0xff +; VI-NEXT: s_lshl_b32 s20, s88, 8 +; VI-NEXT: s_or_b32 s19, s19, s20 ; VI-NEXT: s_and_b32 s18, s18, 0xffff ; VI-NEXT: s_lshl_b32 s19, s19, 16 -; VI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v0 ; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: s_and_b32 s18, s41, 0xff -; VI-NEXT: s_lshl_b32 s19, s52, 8 +; VI-NEXT: s_lshl_b32 s19, s78, 8 ; VI-NEXT: s_or_b32 s18, s18, s19 -; VI-NEXT: s_and_b32 s19, s50, 0xff -; VI-NEXT: s_lshl_b32 s20, s87, 8 +; VI-NEXT: s_and_b32 s19, s76, 0xff +; VI-NEXT: s_lshl_b32 s20, s66, 8 ; VI-NEXT: s_or_b32 s19, s19, s20 ; VI-NEXT: s_and_b32 s18, s18, 0xffff ; VI-NEXT: s_lshl_b32 s19, s19, 16 @@ -216598,10 +216111,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: s_and_b32 s16, s16, 0xff -; VI-NEXT: s_lshl_b32 s18, s85, 8 +; VI-NEXT: s_lshl_b32 s18, s55, 8 +; VI-NEXT: v_readlane_b32 s20, v21, 6 ; VI-NEXT: s_or_b32 s16, s16, s18 -; VI-NEXT: s_and_b32 s18, s83, 0xff -; VI-NEXT: s_lshl_b32 s19, s74, 8 +; VI-NEXT: s_and_b32 s18, s75, 0xff +; VI-NEXT: s_lshl_b32 s19, s20, 8 ; VI-NEXT: s_or_b32 s18, s18, s19 ; VI-NEXT: s_and_b32 s16, s16, 0xffff ; VI-NEXT: s_lshl_b32 s18, s18, 16 @@ -216610,12 +216124,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s16 ; VI-NEXT: s_and_b32 s16, s17, 0xff -; VI-NEXT: s_lshl_b32 s17, s81, 8 +; VI-NEXT: s_lshl_b32 s17, s47, 8 ; VI-NEXT: s_or_b32 s16, s16, s17 -; VI-NEXT: v_readlane_b32 s17, v21, 57 -; VI-NEXT: v_readlane_b32 s18, v21, 56 -; VI-NEXT: s_and_b32 s17, s17, 0xff -; VI-NEXT: s_lshl_b32 s18, s18, 8 +; VI-NEXT: s_and_b32 s17, s53, 0xff +; VI-NEXT: s_lshl_b32 s18, s52, 8 ; VI-NEXT: s_or_b32 s17, s17, s18 ; VI-NEXT: s_and_b32 s16, s16, 0xffff ; VI-NEXT: s_lshl_b32 s17, s17, 16 @@ -216623,13 +216135,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 55 ; VI-NEXT: s_and_b32 s14, s14, 0xff -; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_lshl_b32 s16, s50, 8 +; VI-NEXT: v_readlane_b32 s18, v21, 8 ; VI-NEXT: s_or_b32 s14, s14, s16 -; VI-NEXT: v_readlane_b32 s16, v21, 54 -; VI-NEXT: s_and_b32 s16, s16, 0xff -; VI-NEXT: s_lshl_b32 s17, s72, 8 +; VI-NEXT: s_and_b32 s16, s87, 0xff +; VI-NEXT: s_lshl_b32 s17, s18, 8 ; VI-NEXT: s_or_b32 s16, s16, s17 ; VI-NEXT: s_and_b32 s14, s14, 0xffff ; VI-NEXT: s_lshl_b32 s16, s16, 16 @@ -216638,13 +216149,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: s_and_b32 s14, s15, 0xff -; VI-NEXT: v_readlane_b32 s15, v21, 53 -; VI-NEXT: s_lshl_b32 s15, s15, 8 +; VI-NEXT: s_lshl_b32 s15, s62, 8 ; VI-NEXT: s_or_b32 s14, s14, s15 -; VI-NEXT: v_readlane_b32 s15, v21, 52 -; VI-NEXT: v_readlane_b32 s16, v21, 51 -; VI-NEXT: s_and_b32 s15, s15, 0xff -; VI-NEXT: s_lshl_b32 s16, s16, 8 +; VI-NEXT: s_and_b32 s15, s85, 0xff +; VI-NEXT: s_lshl_b32 s16, s61, 8 ; VI-NEXT: s_or_b32 s15, s15, s16 ; VI-NEXT: s_and_b32 s14, s14, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 @@ -216652,13 +216160,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_readlane_b32 s14, v21, 50 ; VI-NEXT: s_and_b32 s12, s12, 0xff -; VI-NEXT: s_lshl_b32 s14, s14, 8 +; VI-NEXT: s_lshl_b32 s14, s84, 8 +; VI-NEXT: v_readlane_b32 s16, v21, 10 ; VI-NEXT: s_or_b32 s12, s12, s14 -; VI-NEXT: v_readlane_b32 s14, v21, 49 -; VI-NEXT: s_and_b32 s14, s14, 0xff -; VI-NEXT: s_lshl_b32 s15, s62, 8 +; VI-NEXT: s_and_b32 s14, s82, 0xff +; VI-NEXT: s_lshl_b32 s15, s16, 8 ; VI-NEXT: s_or_b32 s14, s14, s15 ; VI-NEXT: s_and_b32 s12, s12, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 @@ -216667,11 +216174,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s12 ; VI-NEXT: s_and_b32 s12, s13, 0xff -; VI-NEXT: v_readlane_b32 s13, v21, 48 -; VI-NEXT: s_lshl_b32 s13, s13, 8 +; VI-NEXT: s_lshl_b32 s13, s59, 8 ; VI-NEXT: s_or_b32 s12, s12, s13 -; VI-NEXT: v_readlane_b32 s13, v21, 47 -; VI-NEXT: v_readlane_b32 s14, v21, 46 +; VI-NEXT: v_readlane_b32 s13, v21, 52 +; VI-NEXT: v_readlane_b32 s14, v21, 51 ; VI-NEXT: s_and_b32 s13, s13, 0xff ; VI-NEXT: s_lshl_b32 s14, s14, 8 ; VI-NEXT: s_or_b32 s13, s13, s14 @@ -216681,13 +216187,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s12 -; VI-NEXT: v_readlane_b32 s12, v21, 45 +; VI-NEXT: v_readlane_b32 s12, v21, 50 ; VI-NEXT: s_and_b32 s10, s10, 0xff ; VI-NEXT: s_lshl_b32 s12, s12, 8 ; VI-NEXT: s_or_b32 s10, s10, s12 -; VI-NEXT: v_readlane_b32 s12, v21, 44 +; VI-NEXT: v_readlane_b32 s12, v21, 49 +; VI-NEXT: v_readlane_b32 s14, v21, 12 ; VI-NEXT: s_and_b32 s12, s12, 0xff -; VI-NEXT: s_lshl_b32 s13, s60, 8 +; VI-NEXT: s_lshl_b32 s13, s14, 8 ; VI-NEXT: s_or_b32 s12, s12, s13 ; VI-NEXT: s_and_b32 s10, s10, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 @@ -216696,11 +216203,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: s_and_b32 s10, s11, 0xff -; VI-NEXT: v_readlane_b32 s11, v21, 43 +; VI-NEXT: v_readlane_b32 s11, v21, 48 ; VI-NEXT: s_lshl_b32 s11, s11, 8 ; VI-NEXT: s_or_b32 s10, s10, s11 -; VI-NEXT: v_readlane_b32 s11, v21, 42 -; VI-NEXT: v_readlane_b32 s12, v21, 41 +; VI-NEXT: v_readlane_b32 s11, v21, 47 +; VI-NEXT: v_readlane_b32 s12, v21, 46 ; VI-NEXT: s_and_b32 s11, s11, 0xff ; VI-NEXT: s_lshl_b32 s12, s12, 8 ; VI-NEXT: s_or_b32 s11, s11, s12 @@ -216710,13 +216217,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_readlane_b32 s10, v21, 40 +; VI-NEXT: v_readlane_b32 s10, v21, 45 ; VI-NEXT: s_and_b32 s8, s8, 0xff ; VI-NEXT: s_lshl_b32 s10, s10, 8 ; VI-NEXT: s_or_b32 s8, s8, s10 -; VI-NEXT: v_readlane_b32 s10, v21, 39 +; VI-NEXT: v_readlane_b32 s10, v21, 44 +; VI-NEXT: v_readlane_b32 s12, v21, 14 ; VI-NEXT: s_and_b32 s10, s10, 0xff -; VI-NEXT: s_lshl_b32 s11, s58, 8 +; VI-NEXT: s_lshl_b32 s11, s12, 8 ; VI-NEXT: s_or_b32 s10, s10, s11 ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 @@ -216725,11 +216233,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_and_b32 s8, s9, 0xff -; VI-NEXT: v_readlane_b32 s9, v21, 38 +; VI-NEXT: v_readlane_b32 s9, v21, 43 ; VI-NEXT: s_lshl_b32 s9, s9, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 -; VI-NEXT: v_readlane_b32 s9, v21, 37 -; VI-NEXT: v_readlane_b32 s10, v21, 36 +; VI-NEXT: v_readlane_b32 s9, v21, 42 +; VI-NEXT: v_readlane_b32 s10, v21, 41 ; VI-NEXT: s_and_b32 s9, s9, 0xff ; VI-NEXT: s_lshl_b32 s10, s10, 8 ; VI-NEXT: s_or_b32 s9, s9, s10 @@ -216739,13 +216247,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_readlane_b32 s8, v21, 35 +; VI-NEXT: v_readlane_b32 s8, v21, 40 ; VI-NEXT: s_and_b32 s6, s6, 0xff ; VI-NEXT: s_lshl_b32 s8, s8, 8 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_readlane_b32 s8, v21, 34 +; VI-NEXT: v_readlane_b32 s8, v21, 39 +; VI-NEXT: v_readlane_b32 s10, v21, 16 ; VI-NEXT: s_and_b32 s8, s8, 0xff -; VI-NEXT: s_lshl_b32 s9, s56, 8 +; VI-NEXT: s_lshl_b32 s9, s10, 8 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 @@ -216754,11 +216263,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_and_b32 s6, s7, 0xff -; VI-NEXT: v_readlane_b32 s7, v21, 33 +; VI-NEXT: v_readlane_b32 s7, v21, 38 ; VI-NEXT: s_lshl_b32 s7, s7, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_readlane_b32 s7, v21, 32 -; VI-NEXT: v_readlane_b32 s8, v21, 31 +; VI-NEXT: v_readlane_b32 s7, v21, 37 +; VI-NEXT: v_readlane_b32 s8, v21, 36 ; VI-NEXT: s_and_b32 s7, s7, 0xff ; VI-NEXT: s_lshl_b32 s8, s8, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 @@ -216768,13 +216277,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_readlane_b32 s6, v21, 30 +; VI-NEXT: v_readlane_b32 s6, v21, 35 ; VI-NEXT: s_and_b32 s4, s4, 0xff ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_readlane_b32 s6, v21, 29 +; VI-NEXT: v_readlane_b32 s6, v21, 34 +; VI-NEXT: v_readlane_b32 s8, v21, 18 ; VI-NEXT: s_and_b32 s6, s6, 0xff -; VI-NEXT: s_lshl_b32 s7, s46, 8 +; VI-NEXT: s_lshl_b32 s7, s8, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -216783,13 +216293,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_and_b32 s4, s5, 0xff -; VI-NEXT: v_readlane_b32 s5, v21, 28 -; VI-NEXT: s_lshl_b32 s5, s5, 8 +; VI-NEXT: s_lshl_b32 s5, s80, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_readlane_b32 s5, v21, 27 -; VI-NEXT: v_readlane_b32 s6, v21, 26 -; VI-NEXT: s_and_b32 s5, s5, 0xff -; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_and_b32 s5, s81, 0xff +; VI-NEXT: s_lshl_b32 s6, s49, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -216798,6 +216305,16 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s27, v21, 1 +; VI-NEXT: v_readlane_b32 s25, v21, 3 +; VI-NEXT: v_readlane_b32 s23, v21, 5 +; VI-NEXT: v_readlane_b32 s21, v21, 7 +; VI-NEXT: v_readlane_b32 s19, v21, 9 +; VI-NEXT: v_readlane_b32 s17, v21, 11 +; VI-NEXT: v_readlane_b32 s15, v21, 13 +; VI-NEXT: v_readlane_b32 s13, v21, 15 +; VI-NEXT: v_readlane_b32 s11, v21, 17 +; VI-NEXT: v_readlane_b32 s9, v21, 19 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_readlane_b32 s87, v20, 31 ; VI-NEXT: v_readlane_b32 s86, v20, 30 @@ -216838,168 +216355,163 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB99_4: -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: v_writelane_b32 v21, s62, 0 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: v_writelane_b32 v21, s63, 1 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: v_writelane_b32 v21, s62, 2 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: v_writelane_b32 v21, s63, 3 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: v_writelane_b32 v21, s62, 4 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: v_writelane_b32 v21, s63, 5 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: v_writelane_b32 v21, s62, 6 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: v_writelane_b32 v21, s63, 7 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: v_writelane_b32 v21, s62, 8 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: v_writelane_b32 v21, s63, 9 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: v_writelane_b32 v21, vcc_lo, 10 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: v_writelane_b32 v21, vcc_hi, 11 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: v_writelane_b32 v21, vcc_lo, 12 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: v_writelane_b32 v21, vcc_hi, 13 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: v_writelane_b32 v21, vcc_lo, 14 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: v_writelane_b32 v21, vcc_hi, 15 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: v_writelane_b32 v21, vcc_lo, 16 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: v_writelane_b32 v21, vcc_hi, 17 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; implicit-def: $vcc_lo +; VI-NEXT: v_writelane_b32 v21, vcc_lo, 18 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: v_writelane_b32 v21, vcc_hi, 19 +; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr90 ; VI-NEXT: ; implicit-def: $sgpr69 ; VI-NEXT: ; implicit-def: $sgpr68 -; VI-NEXT: ; implicit-def: $sgpr67 -; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr79 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr38 ; VI-NEXT: ; implicit-def: $sgpr65 +; VI-NEXT: ; implicit-def: $sgpr64 ; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 ; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr84 -; VI-NEXT: ; implicit-def: $sgpr82 -; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr34 +; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr91 +; VI-NEXT: ; implicit-def: $sgpr48 +; VI-NEXT: ; implicit-def: $sgpr39 +; VI-NEXT: ; implicit-def: $sgpr37 +; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr31 +; VI-NEXT: ; kill: killed $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr53 ; VI-NEXT: ; implicit-def: $sgpr52 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr83 +; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr84 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr89 +; VI-NEXT: ; kill: killed $sgpr89 +; VI-NEXT: ; implicit-def: $sgpr80 ; VI-NEXT: ; implicit-def: $sgpr81 -; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr30 -; VI-NEXT: ; implicit-def: $sgpr90 -; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr49 ; VI-NEXT: s_branch .LBB99_2 ; ; GFX9-LABEL: bitcast_v64i16_to_v128i8_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v63, s30, 0 ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 @@ -217182,26 +216694,30 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_writelane_b32 v62, s26, 11 ; GFX9-NEXT: s_lshr_b32 s26, s18, 8 ; GFX9-NEXT: v_writelane_b32 v62, s26, 10 -; GFX9-NEXT: s_lshr_b32 s26, s21, 24 -; GFX9-NEXT: v_writelane_b32 v62, s26, 9 -; GFX9-NEXT: s_lshr_b32 s26, s21, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 8 -; GFX9-NEXT: s_lshr_b32 s26, s21, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 7 -; GFX9-NEXT: s_lshr_b32 s26, s20, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 6 -; GFX9-NEXT: s_lshr_b32 s26, s20, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 5 -; GFX9-NEXT: s_lshr_b32 s26, s23, 24 -; GFX9-NEXT: v_writelane_b32 v62, s26, 4 -; GFX9-NEXT: s_lshr_b32 s26, s23, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 3 -; GFX9-NEXT: s_lshr_b32 s26, s23, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 2 -; GFX9-NEXT: s_lshr_b32 s26, s22, 16 -; GFX9-NEXT: v_writelane_b32 v62, s26, 1 -; GFX9-NEXT: s_lshr_b32 s26, s22, 8 -; GFX9-NEXT: v_writelane_b32 v62, s26, 0 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[10:11], 24 +; GFX9-NEXT: v_writelane_b32 v62, s78, 8 +; GFX9-NEXT: v_writelane_b32 v62, s79, 9 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 +; GFX9-NEXT: v_writelane_b32 v62, s78, 6 +; GFX9-NEXT: v_writelane_b32 v62, s79, 7 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 +; GFX9-NEXT: v_writelane_b32 v62, s78, 4 +; GFX9-NEXT: v_writelane_b32 v62, s79, 5 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[16:17], 24 +; GFX9-NEXT: v_writelane_b32 v62, s78, 2 +; GFX9-NEXT: v_writelane_b32 v62, s79, 3 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[18:19], 24 +; GFX9-NEXT: v_writelane_b32 v62, s78, 0 +; GFX9-NEXT: s_lshr_b32 s64, s21, 24 +; GFX9-NEXT: s_lshr_b32 s65, s21, 16 +; GFX9-NEXT: s_lshr_b32 s67, s21, 8 +; GFX9-NEXT: s_lshr_b32 s66, s20, 16 +; GFX9-NEXT: s_lshr_b32 s68, s20, 8 +; GFX9-NEXT: s_lshr_b32 s69, s23, 24 +; GFX9-NEXT: s_lshr_b32 s70, s23, 16 +; GFX9-NEXT: s_lshr_b32 s80, s23, 8 +; GFX9-NEXT: s_lshr_b32 s71, s22, 16 +; GFX9-NEXT: s_lshr_b32 s81, s22, 8 ; GFX9-NEXT: s_lshr_b32 s82, s25, 24 ; GFX9-NEXT: s_lshr_b32 s83, s25, 16 ; GFX9-NEXT: s_lshr_b32 s85, s25, 8 @@ -217222,327 +216738,314 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: s_lshr_b32 s54, s45, 8 ; GFX9-NEXT: s_lshr_b32 s53, s44, 16 ; GFX9-NEXT: s_lshr_b32 s55, s44, 8 -; GFX9-NEXT: s_lshr_b32 s64, s47, 24 -; GFX9-NEXT: s_lshr_b32 s65, s47, 16 -; GFX9-NEXT: s_lshr_b32 s67, s47, 8 -; GFX9-NEXT: s_lshr_b32 s66, s46, 16 -; GFX9-NEXT: s_lshr_b32 s68, s46, 8 -; GFX9-NEXT: s_lshr_b32 s69, s57, 24 -; GFX9-NEXT: s_lshr_b32 s70, s57, 16 -; GFX9-NEXT: s_lshr_b32 s80, s57, 8 -; GFX9-NEXT: s_lshr_b32 s71, s56, 16 -; GFX9-NEXT: s_lshr_b32 s81, s56, 8 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 -; GFX9-NEXT: s_lshr_b64 s[28:29], s[6:7], 24 -; GFX9-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; GFX9-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; GFX9-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; GFX9-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; GFX9-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; GFX9-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 -; GFX9-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 -; GFX9-NEXT: s_lshr_b64 s[88:89], s[22:23], 24 -; GFX9-NEXT: s_lshr_b64 s[90:91], s[24:25], 24 -; GFX9-NEXT: s_lshr_b64 s[92:93], s[40:41], 24 -; GFX9-NEXT: s_lshr_b64 s[94:95], s[42:43], 24 -; GFX9-NEXT: s_lshr_b64 s[30:31], s[44:45], 24 -; GFX9-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 -; GFX9-NEXT: s_lshr_b64 s[36:37], s[56:57], 24 +; GFX9-NEXT: s_lshr_b32 s26, s47, 24 +; GFX9-NEXT: s_lshr_b32 s27, s47, 16 +; GFX9-NEXT: s_lshr_b32 s29, s47, 8 +; GFX9-NEXT: s_lshr_b32 s28, s46, 16 +; GFX9-NEXT: s_lshr_b32 s58, s46, 8 +; GFX9-NEXT: s_lshr_b32 s59, s57, 24 +; GFX9-NEXT: s_lshr_b32 s60, s57, 16 +; GFX9-NEXT: s_lshr_b32 s62, s57, 8 +; GFX9-NEXT: s_lshr_b32 s61, s56, 16 +; GFX9-NEXT: s_lshr_b32 s63, s56, 8 +; GFX9-NEXT: s_lshr_b64 s[76:77], s[4:5], 24 +; GFX9-NEXT: s_lshr_b64 s[74:75], s[6:7], 24 +; GFX9-NEXT: s_lshr_b64 s[72:73], s[8:9], 24 +; GFX9-NEXT: v_writelane_b32 v62, s79, 1 +; GFX9-NEXT: s_lshr_b64 s[36:37], s[20:21], 24 +; GFX9-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; GFX9-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; GFX9-NEXT: s_lshr_b64 s[94:95], s[40:41], 24 +; GFX9-NEXT: s_lshr_b64 s[92:93], s[42:43], 24 +; GFX9-NEXT: s_lshr_b64 s[90:91], s[44:45], 24 +; GFX9-NEXT: s_lshr_b64 s[88:89], s[46:47], 24 +; GFX9-NEXT: s_lshr_b64 s[78:79], s[56:57], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB99_4 ; GFX9-NEXT: .LBB99_2: ; %cmp.true -; GFX9-NEXT: v_pk_add_u16 v26, s5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v25, s4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[25:26] -; GFX9-NEXT: v_pk_add_u16 v28, s7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[27:28] -; GFX9-NEXT: v_pk_add_u16 v30, s9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v29, s8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[29:30] -; GFX9-NEXT: v_pk_add_u16 v32, s11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v31, s10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[31:32] -; GFX9-NEXT: v_pk_add_u16 v34, s13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v33, s12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_pk_add_u16 v18, s5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; GFX9-NEXT: v_pk_add_u16 v20, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[33:34] -; GFX9-NEXT: v_pk_add_u16 v36, s15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v35, s14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[35:36] -; GFX9-NEXT: v_pk_add_u16 v38, s17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v37, s16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[37:38] -; GFX9-NEXT: v_pk_add_u16 v49, s19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v48, s18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[48:49] +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; GFX9-NEXT: v_pk_add_u16 v22, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; GFX9-NEXT: v_pk_add_u16 v24, s11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, s10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; GFX9-NEXT: v_pk_add_u16 v26, s13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, s12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; GFX9-NEXT: v_pk_add_u16 v28, s15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, s14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] ; GFX9-NEXT: v_pk_add_u16 v2, s21, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] ; GFX9-NEXT: v_pk_add_u16 v4, s23, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v3, s22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] ; GFX9-NEXT: v_pk_add_u16 v6, s25, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v5, s24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] ; GFX9-NEXT: v_pk_add_u16 v8, s41, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, s40, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] ; GFX9-NEXT: v_pk_add_u16 v10, s43, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v9, s42, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] ; GFX9-NEXT: v_pk_add_u16 v12, s45, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, s44, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] ; GFX9-NEXT: v_pk_add_u16 v14, s47, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s46, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 -; GFX9-NEXT: v_pk_add_u16 v22, s57, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, s56, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v28 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v27 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v30 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v29 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v12 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v32 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v11 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v31 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v34 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v14 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v33 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v13 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v36 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v13 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v36 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v22 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v35 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v22 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v22 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v38 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v36 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v38 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v4 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; GFX9-NEXT: v_pk_add_u16 v16, s57, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, s56, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v26 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v2 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v25 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v28 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v1 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v4 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v28 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v4 +; GFX9-NEXT: v_pk_add_u16 v30, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v27 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v30 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; GFX9-NEXT: v_pk_add_u16 v29, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v30 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v5 +; GFX9-NEXT: v_pk_add_u16 v32, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v29 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v32 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v32 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v8 +; GFX9-NEXT: v_pk_add_u16 v31, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v32 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v7 +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v3 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v13 +; GFX9-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v7 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v17 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v31 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v12 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v13 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v15 ; GFX9-NEXT: s_branch .LBB99_5 ; GFX9-NEXT: .LBB99_3: ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr81 -; GFX9-NEXT: ; implicit-def: $sgpr71 -; GFX9-NEXT: ; implicit-def: $sgpr80 -; GFX9-NEXT: ; implicit-def: $sgpr70 -; GFX9-NEXT: ; implicit-def: $sgpr69 -; GFX9-NEXT: ; implicit-def: $sgpr68 -; GFX9-NEXT: ; implicit-def: $sgpr66 -; GFX9-NEXT: ; implicit-def: $sgpr67 -; GFX9-NEXT: ; implicit-def: $sgpr65 -; GFX9-NEXT: ; implicit-def: $sgpr64 +; GFX9-NEXT: ; implicit-def: $sgpr72 +; GFX9-NEXT: ; implicit-def: $sgpr74 +; GFX9-NEXT: ; implicit-def: $sgpr76 +; GFX9-NEXT: ; implicit-def: $sgpr63 +; GFX9-NEXT: ; implicit-def: $sgpr61 +; GFX9-NEXT: ; implicit-def: $sgpr78 +; GFX9-NEXT: ; implicit-def: $sgpr62 +; GFX9-NEXT: ; implicit-def: $sgpr60 +; GFX9-NEXT: ; implicit-def: $sgpr59 +; GFX9-NEXT: ; implicit-def: $sgpr58 +; GFX9-NEXT: ; implicit-def: $sgpr28 +; GFX9-NEXT: ; implicit-def: $sgpr88 +; GFX9-NEXT: ; implicit-def: $sgpr29 ; GFX9-NEXT: ; implicit-def: $sgpr55 ; GFX9-NEXT: ; implicit-def: $sgpr53 +; GFX9-NEXT: ; implicit-def: $sgpr90 ; GFX9-NEXT: ; implicit-def: $sgpr54 ; GFX9-NEXT: ; implicit-def: $sgpr52 ; GFX9-NEXT: ; implicit-def: $sgpr51 ; GFX9-NEXT: ; implicit-def: $sgpr50 ; GFX9-NEXT: ; implicit-def: $sgpr48 +; GFX9-NEXT: ; implicit-def: $sgpr92 ; GFX9-NEXT: ; implicit-def: $sgpr49 ; GFX9-NEXT: ; implicit-def: $sgpr39 ; GFX9-NEXT: ; implicit-def: $sgpr38 ; GFX9-NEXT: ; implicit-def: $sgpr99 ; GFX9-NEXT: ; implicit-def: $sgpr97 +; GFX9-NEXT: ; implicit-def: $sgpr94 ; GFX9-NEXT: ; implicit-def: $sgpr98 ; GFX9-NEXT: ; implicit-def: $sgpr96 ; GFX9-NEXT: ; implicit-def: $sgpr87 ; GFX9-NEXT: ; implicit-def: $sgpr86 ; GFX9-NEXT: ; implicit-def: $sgpr84 +; GFX9-NEXT: ; implicit-def: $sgpr30 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr82 -; GFX9-NEXT: ; implicit-def: $sgpr76 -; GFX9-NEXT: ; implicit-def: $sgpr74 -; GFX9-NEXT: ; implicit-def: $sgpr72 -; GFX9-NEXT: ; implicit-def: $sgpr62 -; GFX9-NEXT: ; implicit-def: $sgpr60 -; GFX9-NEXT: ; implicit-def: $sgpr58 -; GFX9-NEXT: ; implicit-def: $sgpr28 -; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr71 ; GFX9-NEXT: ; implicit-def: $sgpr34 -; GFX9-NEXT: ; implicit-def: $sgpr30 -; GFX9-NEXT: ; implicit-def: $sgpr94 -; GFX9-NEXT: ; implicit-def: $sgpr92 -; GFX9-NEXT: ; implicit-def: $sgpr90 -; GFX9-NEXT: ; implicit-def: $sgpr88 -; GFX9-NEXT: ; implicit-def: $sgpr78 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 -; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr80 +; GFX9-NEXT: ; implicit-def: $sgpr70 +; GFX9-NEXT: ; implicit-def: $sgpr69 +; GFX9-NEXT: ; implicit-def: $sgpr68 +; GFX9-NEXT: ; implicit-def: $sgpr66 +; GFX9-NEXT: ; implicit-def: $sgpr36 +; GFX9-NEXT: ; implicit-def: $sgpr67 +; GFX9-NEXT: ; implicit-def: $sgpr65 +; GFX9-NEXT: ; implicit-def: $sgpr64 ; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s26, 0 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s27, 1 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 @@ -217552,9 +217055,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s26, 2 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s27, 3 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 @@ -217564,9 +217068,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s26, 4 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s27, 5 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 @@ -217576,9 +217081,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s26, 6 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s27, 7 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 @@ -217588,11 +217094,13 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 -; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s26, 8 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: v_writelane_b32 v62, s27, 9 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 +; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: ; kill: killed $sgpr26 ; GFX9-NEXT: ; implicit-def: $sgpr26 @@ -217628,254 +217136,233 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: s_branch .LBB99_2 ; GFX9-NEXT: .LBB99_4: -; GFX9-NEXT: v_mov_b32_e32 v15, s71 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s80 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s70 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s69 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s68 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s66 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s67 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s65 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s64 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s55 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s53 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s54 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s52 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s51 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s50 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s48 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s49 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s39 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s38 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s99 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s97 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s98 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s96 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s87 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s86 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s84 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s85 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s83 -; GFX9-NEXT: v_mov_b32_e32 v25, s4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s82 -; GFX9-NEXT: v_readlane_b32 s4, v62, 0 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 1 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 2 -; GFX9-NEXT: v_mov_b32_e32 v19, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 3 -; GFX9-NEXT: v_mov_b32_e32 v55, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 5 -; GFX9-NEXT: v_mov_b32_e32 v53, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 6 -; GFX9-NEXT: v_mov_b32_e32 v52, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 7 -; GFX9-NEXT: v_mov_b32_e32 v51, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 8 -; GFX9-NEXT: v_mov_b32_e32 v50, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 9 -; GFX9-NEXT: v_mov_b32_e32 v24, s4 +; GFX9-NEXT: v_mov_b32_e32 v38, s51 +; GFX9-NEXT: v_mov_b32_e32 v37, s76 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, s74 +; GFX9-NEXT: v_mov_b32_e32 v17, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 10 -; GFX9-NEXT: v_mov_b32_e32 v20, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 11 ; GFX9-NEXT: v_mov_b32_e32 v42, s4 +; GFX9-NEXT: v_mov_b32_e32 v41, s94 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s62 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s59 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s58 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s53 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s39 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s38 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s99 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s97 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s98 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s96 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s87 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s86 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s84 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s85 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s83 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s82 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s81 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s71 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s80 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s70 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s69 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s68 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s66 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s67 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s65 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s64 +; GFX9-NEXT: v_readlane_b32 s4, v62, 11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 12 -; GFX9-NEXT: v_mov_b32_e32 v18, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 13 -; GFX9-NEXT: v_mov_b32_e32 v39, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 14 -; GFX9-NEXT: v_mov_b32_e32 v43, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 15 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v15, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 16 -; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 17 -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, s72 +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 18 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 19 -; GFX9-NEXT: v_mov_b32_e32 v54, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 20 -; GFX9-NEXT: v_mov_b32_e32 v61, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 21 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 22 -; GFX9-NEXT: v_mov_b32_e32 v60, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 23 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 24 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 25 -; GFX9-NEXT: v_mov_b32_e32 v23, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 26 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 27 -; GFX9-NEXT: v_mov_b32_e32 v59, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 28 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_mov_b32_e32 v61, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 29 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 30 -; GFX9-NEXT: v_mov_b32_e32 v58, s4 +; GFX9-NEXT: v_mov_b32_e32 v60, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 31 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_mov_b32_e32 v59, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 32 -; GFX9-NEXT: v_mov_b32_e32 v57, s4 +; GFX9-NEXT: v_mov_b32_e32 v53, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 33 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v33, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 34 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_mov_b32_e32 v58, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 35 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_mov_b32_e32 v57, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 36 -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 37 -; GFX9-NEXT: v_mov_b32_e32 v56, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 38 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 39 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 40 -; GFX9-NEXT: v_mov_b32_e32 v47, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 41 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 42 -; GFX9-NEXT: v_mov_b32_e32 v46, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 43 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 44 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 45 -; GFX9-NEXT: v_mov_b32_e32 v45, s4 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 46 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_mov_b32_e32 v56, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 47 -; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 48 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 +; GFX9-NEXT: v_mov_b32_e32 v46, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s4 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s26 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s28 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s58 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s60 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s62 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s72 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s74 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s76 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v47, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 8 +; GFX9-NEXT: v_mov_b32_e32 v41, s92 +; GFX9-NEXT: v_mov_b32_e32 v37, s4 +; GFX9-NEXT: v_mov_b32_e32 v34, s34 +; GFX9-NEXT: v_mov_b32_e32 v55, s30 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s78 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s88 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_readlane_b32 s4, v62, 6 ; GFX9-NEXT: v_mov_b32_e32 v41, s90 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s92 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s94 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s30 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s34 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, s36 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, s4 +; GFX9-NEXT: v_mov_b32_e32 v41, v46 +; GFX9-NEXT: v_mov_b32_e32 v46, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, s88 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s56 -; GFX9-NEXT: v_mov_b32_e32 v22, s57 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_readlane_b32 s4, v62, 4 +; GFX9-NEXT: v_mov_b32_e32 v56, s78 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, s4 +; GFX9-NEXT: v_mov_b32_e32 v18, s5 +; GFX9-NEXT: v_readlane_b32 s5, v62, 9 +; GFX9-NEXT: v_readlane_b32 s5, v62, 7 +; GFX9-NEXT: v_readlane_b32 s5, v62, 5 +; GFX9-NEXT: v_readlane_b32 s4, v62, 2 +; GFX9-NEXT: v_readlane_b32 s5, v62, 3 +; GFX9-NEXT: v_mov_b32_e32 v43, s4 +; GFX9-NEXT: v_readlane_b32 s4, v62, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, s56 +; GFX9-NEXT: v_mov_b32_e32 v16, s57 ; GFX9-NEXT: v_mov_b32_e32 v13, s46 ; GFX9-NEXT: v_mov_b32_e32 v14, s47 ; GFX9-NEXT: v_mov_b32_e32 v11, s44 @@ -217890,63 +217377,53 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v4, s23 ; GFX9-NEXT: v_mov_b32_e32 v1, s20 ; GFX9-NEXT: v_mov_b32_e32 v2, s21 -; GFX9-NEXT: v_mov_b32_e32 v48, s18 -; GFX9-NEXT: v_mov_b32_e32 v49, s19 -; GFX9-NEXT: v_mov_b32_e32 v37, s16 -; GFX9-NEXT: v_mov_b32_e32 v38, s17 -; GFX9-NEXT: v_mov_b32_e32 v35, s14 -; GFX9-NEXT: v_mov_b32_e32 v36, s15 -; GFX9-NEXT: v_mov_b32_e32 v33, s12 -; GFX9-NEXT: v_mov_b32_e32 v34, s13 -; GFX9-NEXT: v_mov_b32_e32 v31, s10 -; GFX9-NEXT: v_mov_b32_e32 v32, s11 -; GFX9-NEXT: v_mov_b32_e32 v29, s8 -; GFX9-NEXT: v_mov_b32_e32 v30, s9 -; GFX9-NEXT: v_mov_b32_e32 v27, s6 -; GFX9-NEXT: v_mov_b32_e32 v28, s7 -; GFX9-NEXT: v_mov_b32_e32 v26, s5 -; GFX9-NEXT: v_mov_b32_e32 v41, v50 -; GFX9-NEXT: v_mov_b32_e32 v50, v51 -; GFX9-NEXT: v_mov_b32_e32 v51, v52 -; GFX9-NEXT: v_mov_b32_e32 v52, v53 -; GFX9-NEXT: v_mov_b32_e32 v53, v55 -; GFX9-NEXT: v_mov_b32_e32 v55, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, s81 +; GFX9-NEXT: v_mov_b32_e32 v31, s18 +; GFX9-NEXT: v_mov_b32_e32 v32, s19 +; GFX9-NEXT: v_mov_b32_e32 v29, s16 +; GFX9-NEXT: v_mov_b32_e32 v30, s17 +; GFX9-NEXT: v_mov_b32_e32 v27, s14 +; GFX9-NEXT: v_mov_b32_e32 v28, s15 +; GFX9-NEXT: v_mov_b32_e32 v25, s12 +; GFX9-NEXT: v_mov_b32_e32 v26, s13 +; GFX9-NEXT: v_mov_b32_e32 v23, s10 +; GFX9-NEXT: v_mov_b32_e32 v24, s11 +; GFX9-NEXT: v_mov_b32_e32 v21, s8 +; GFX9-NEXT: v_mov_b32_e32 v22, s9 +; GFX9-NEXT: v_mov_b32_e32 v19, s6 +; GFX9-NEXT: v_mov_b32_e32 v20, s7 +; GFX9-NEXT: v_mov_b32_e32 v51, s26 +; GFX9-NEXT: v_readlane_b32 s5, v62, 1 +; GFX9-NEXT: v_mov_b32_e32 v44, s4 +; GFX9-NEXT: v_mov_b32_e32 v36, s48 +; GFX9-NEXT: v_mov_b32_e32 v40, s50 +; GFX9-NEXT: v_mov_b32_e32 v48, s52 +; GFX9-NEXT: v_mov_b32_e32 v49, s54 +; GFX9-NEXT: v_mov_b32_e32 v34, s55 +; GFX9-NEXT: v_mov_b32_e32 v39, s27 +; GFX9-NEXT: v_mov_b32_e32 v54, s29 +; GFX9-NEXT: v_mov_b32_e32 v50, s28 +; GFX9-NEXT: v_mov_b32_e32 v52, s60 +; GFX9-NEXT: v_mov_b32_e32 v55, s61 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, s36 +; GFX9-NEXT: v_mov_b32_e32 v45, s63 +; GFX9-NEXT: v_mov_b32_e32 v56, v57 +; GFX9-NEXT: v_mov_b32_e32 v57, v33 +; GFX9-NEXT: v_mov_b32_e32 v33, v53 +; GFX9-NEXT: v_mov_b32_e32 v53, v59 +; GFX9-NEXT: v_mov_b32_e32 v59, v60 +; GFX9-NEXT: v_mov_b32_e32 v60, v61 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v37, s49 ; GFX9-NEXT: .LBB99_5: ; %end -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v15 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_or_b32_sdwa v16, v37, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v21, v21, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v35, v35, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v37, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v36, v36, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v23, v33, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 8, v57 -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v56 -; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v47 -; GFX9-NEXT: v_or_b32_sdwa v27, v27, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v46 -; GFX9-NEXT: v_or_b32_sdwa v28, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v25, v25, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v44 -; GFX9-NEXT: v_or_b32_sdwa v26, v26, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v44 ; GFX9-NEXT: v_readlane_b32 s99, v63, 35 ; GFX9-NEXT: v_readlane_b32 s98, v63, 34 ; GFX9-NEXT: v_readlane_b32 s97, v63, 33 @@ -217983,329 +217460,343 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v15, v38, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_or_b32_sdwa v22, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v19, v42, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v43 -; GFX9-NEXT: v_or_b32_sdwa v19, v39, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v40, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v54 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v44, v61, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v36, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v32, v44, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v43 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX9-NEXT: v_or_b32_sdwa v26, v60, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v25, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v24, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v27, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v28, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v35, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v47 +; GFX9-NEXT: v_or_b32_sdwa v18, v41, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v55, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v52, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v15, v50, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v51 +; GFX9-NEXT: v_or_b32_sdwa v14, v39, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v49 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v12, v48, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v37 ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v55 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v53, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v52 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload @@ -218323,8 +217814,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -218334,257 +217825,257 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 ; GFX11-NEXT: s_mov_b32 exec_lo, s4 -; GFX11-NEXT: v_writelane_b32 v75, s30, 0 -; GFX11-NEXT: v_writelane_b32 v76, s96, 0 +; GFX11-NEXT: v_writelane_b32 v73, s30, 0 +; GFX11-NEXT: v_writelane_b32 v74, s96, 0 ; GFX11-NEXT: v_dual_mov_b32 v16, s28 :: v_dual_mov_b32 v17, s29 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15 -; GFX11-NEXT: v_writelane_b32 v75, s31, 1 -; GFX11-NEXT: v_writelane_b32 v76, s97, 1 +; GFX11-NEXT: v_writelane_b32 v73, s31, 1 +; GFX11-NEXT: v_writelane_b32 v74, s97, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_readfirstlane_b32 s40, v16 ; GFX11-NEXT: v_readfirstlane_b32 s41, v17 ; GFX11-NEXT: v_readfirstlane_b32 s28, v1 -; GFX11-NEXT: v_writelane_b32 v75, s34, 2 -; GFX11-NEXT: v_writelane_b32 v76, s98, 2 +; GFX11-NEXT: v_writelane_b32 v73, s34, 2 +; GFX11-NEXT: v_writelane_b32 v74, s98, 2 ; GFX11-NEXT: v_readfirstlane_b32 s29, v2 ; GFX11-NEXT: v_readfirstlane_b32 s14, v3 ; GFX11-NEXT: v_readfirstlane_b32 s15, v4 -; GFX11-NEXT: v_writelane_b32 v75, s35, 3 -; GFX11-NEXT: v_writelane_b32 v76, s99, 3 +; GFX11-NEXT: v_writelane_b32 v73, s35, 3 +; GFX11-NEXT: v_writelane_b32 v74, s99, 3 ; GFX11-NEXT: v_readfirstlane_b32 s12, v5 ; GFX11-NEXT: v_readfirstlane_b32 s13, v6 ; GFX11-NEXT: v_readfirstlane_b32 s10, v7 -; GFX11-NEXT: v_writelane_b32 v75, s36, 4 -; GFX11-NEXT: v_writelane_b32 v76, s100, 4 +; GFX11-NEXT: v_writelane_b32 v73, s36, 4 +; GFX11-NEXT: v_writelane_b32 v74, s100, 4 ; GFX11-NEXT: v_readfirstlane_b32 s11, v8 ; GFX11-NEXT: v_readfirstlane_b32 s8, v9 ; GFX11-NEXT: v_readfirstlane_b32 s9, v10 -; GFX11-NEXT: v_writelane_b32 v75, s37, 5 -; GFX11-NEXT: v_writelane_b32 v76, s101, 5 +; GFX11-NEXT: v_writelane_b32 v73, s37, 5 +; GFX11-NEXT: v_writelane_b32 v74, s101, 5 ; GFX11-NEXT: v_readfirstlane_b32 s6, v11 ; GFX11-NEXT: v_readfirstlane_b32 s7, v12 ; GFX11-NEXT: v_readfirstlane_b32 s4, v13 -; GFX11-NEXT: v_writelane_b32 v75, s38, 6 -; GFX11-NEXT: v_writelane_b32 v76, s102, 6 +; GFX11-NEXT: v_writelane_b32 v73, s38, 6 +; GFX11-NEXT: v_writelane_b32 v74, s102, 6 ; GFX11-NEXT: v_readfirstlane_b32 s5, v14 ; GFX11-NEXT: s_mov_b32 s99, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: v_writelane_b32 v75, s39, 7 -; GFX11-NEXT: v_writelane_b32 v76, s103, 7 -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 -; GFX11-NEXT: ; implicit-def: $vgpr78 : SGPR spill to VGPR lane -; GFX11-NEXT: ; implicit-def: $vgpr77 : SGPR spill to VGPR lane -; GFX11-NEXT: v_writelane_b32 v75, s48, 8 -; GFX11-NEXT: v_writelane_b32 v76, s104, 8 -; GFX11-NEXT: v_writelane_b32 v75, s49, 9 -; GFX11-NEXT: v_writelane_b32 v75, s50, 10 -; GFX11-NEXT: v_writelane_b32 v75, s51, 11 -; GFX11-NEXT: v_writelane_b32 v75, s52, 12 -; GFX11-NEXT: v_writelane_b32 v75, s53, 13 -; GFX11-NEXT: v_writelane_b32 v75, s54, 14 -; GFX11-NEXT: v_writelane_b32 v75, s55, 15 -; GFX11-NEXT: v_writelane_b32 v75, s64, 16 -; GFX11-NEXT: v_writelane_b32 v75, s65, 17 -; GFX11-NEXT: v_writelane_b32 v75, s66, 18 -; GFX11-NEXT: v_writelane_b32 v75, s67, 19 -; GFX11-NEXT: v_writelane_b32 v75, s68, 20 -; GFX11-NEXT: v_writelane_b32 v75, s69, 21 -; GFX11-NEXT: v_writelane_b32 v75, s70, 22 -; GFX11-NEXT: v_writelane_b32 v75, s71, 23 -; GFX11-NEXT: v_writelane_b32 v75, s80, 24 -; GFX11-NEXT: v_writelane_b32 v75, s81, 25 -; GFX11-NEXT: v_writelane_b32 v75, s82, 26 -; GFX11-NEXT: v_writelane_b32 v75, s83, 27 -; GFX11-NEXT: v_writelane_b32 v75, s84, 28 -; GFX11-NEXT: v_writelane_b32 v75, s85, 29 -; GFX11-NEXT: v_writelane_b32 v75, s86, 30 -; GFX11-NEXT: v_writelane_b32 v75, s87, 31 +; GFX11-NEXT: v_writelane_b32 v73, s39, 7 +; GFX11-NEXT: v_writelane_b32 v74, s103, 7 +; GFX11-NEXT: s_clause 0x10 ; 68-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 +; GFX11-NEXT: ; implicit-def: $vgpr76 : SGPR spill to VGPR lane +; GFX11-NEXT: ; implicit-def: $vgpr75 : SGPR spill to VGPR lane +; GFX11-NEXT: v_writelane_b32 v73, s48, 8 +; GFX11-NEXT: v_writelane_b32 v74, s104, 8 +; GFX11-NEXT: v_writelane_b32 v73, s49, 9 +; GFX11-NEXT: v_writelane_b32 v73, s50, 10 +; GFX11-NEXT: v_writelane_b32 v73, s51, 11 +; GFX11-NEXT: v_writelane_b32 v73, s52, 12 +; GFX11-NEXT: v_writelane_b32 v73, s53, 13 +; GFX11-NEXT: v_writelane_b32 v73, s54, 14 +; GFX11-NEXT: v_writelane_b32 v73, s55, 15 +; GFX11-NEXT: v_writelane_b32 v73, s64, 16 +; GFX11-NEXT: v_writelane_b32 v73, s65, 17 +; GFX11-NEXT: v_writelane_b32 v73, s66, 18 +; GFX11-NEXT: v_writelane_b32 v73, s67, 19 +; GFX11-NEXT: v_writelane_b32 v73, s68, 20 +; GFX11-NEXT: v_writelane_b32 v73, s69, 21 +; GFX11-NEXT: v_writelane_b32 v73, s70, 22 +; GFX11-NEXT: v_writelane_b32 v73, s71, 23 +; GFX11-NEXT: v_writelane_b32 v73, s80, 24 +; GFX11-NEXT: v_writelane_b32 v73, s81, 25 +; GFX11-NEXT: v_writelane_b32 v73, s82, 26 +; GFX11-NEXT: v_writelane_b32 v73, s83, 27 +; GFX11-NEXT: v_writelane_b32 v73, s84, 28 +; GFX11-NEXT: v_writelane_b32 v73, s85, 29 +; GFX11-NEXT: v_writelane_b32 v73, s86, 30 +; GFX11-NEXT: v_writelane_b32 v73, s87, 31 ; GFX11-NEXT: s_cbranch_scc0 .LBB99_3 ; GFX11-NEXT: ; %bb.1: ; %cmp.false ; GFX11-NEXT: s_lshr_b32 s42, s27, 16 ; GFX11-NEXT: s_lshr_b64 s[74:75], s[0:1], 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 8 +; GFX11-NEXT: v_writelane_b32 v75, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s27, 8 ; GFX11-NEXT: s_lshr_b32 s43, s27, 24 ; GFX11-NEXT: s_lshr_b32 s34, s5, 24 ; GFX11-NEXT: s_lshr_b32 s35, s5, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 7 +; GFX11-NEXT: v_writelane_b32 v75, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s26, 16 ; GFX11-NEXT: s_lshr_b32 s37, s5, 8 ; GFX11-NEXT: s_lshr_b32 s36, s4, 16 ; GFX11-NEXT: s_lshr_b32 s38, s4, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 6 +; GFX11-NEXT: v_writelane_b32 v75, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s26, 8 ; GFX11-NEXT: s_lshr_b32 s39, s7, 24 ; GFX11-NEXT: s_lshr_b32 s48, s7, 16 ; GFX11-NEXT: s_lshr_b32 s50, s7, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 5 +; GFX11-NEXT: v_writelane_b32 v75, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s25, 24 ; GFX11-NEXT: s_lshr_b32 s49, s6, 16 ; GFX11-NEXT: s_lshr_b32 s51, s6, 8 ; GFX11-NEXT: s_lshr_b32 s52, s9, 24 -; GFX11-NEXT: v_writelane_b32 v77, s42, 4 +; GFX11-NEXT: v_writelane_b32 v75, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s25, 16 ; GFX11-NEXT: s_lshr_b32 s53, s9, 16 ; GFX11-NEXT: s_lshr_b32 s55, s9, 8 ; GFX11-NEXT: s_lshr_b32 s54, s8, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 3 +; GFX11-NEXT: v_writelane_b32 v75, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s25, 8 ; GFX11-NEXT: s_lshr_b32 s64, s8, 8 ; GFX11-NEXT: s_lshr_b32 s65, s11, 24 ; GFX11-NEXT: s_lshr_b32 s66, s11, 16 -; GFX11-NEXT: v_writelane_b32 v77, s42, 2 +; GFX11-NEXT: v_writelane_b32 v75, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s24, 16 ; GFX11-NEXT: s_lshr_b32 s68, s11, 8 ; GFX11-NEXT: s_lshr_b32 s67, s10, 16 ; GFX11-NEXT: s_lshr_b32 s69, s10, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 1 +; GFX11-NEXT: v_writelane_b32 v75, s42, 1 ; GFX11-NEXT: s_lshr_b32 s42, s24, 8 ; GFX11-NEXT: s_lshr_b32 s70, s13, 24 ; GFX11-NEXT: s_lshr_b32 s71, s13, 16 ; GFX11-NEXT: s_lshr_b32 s81, s13, 8 -; GFX11-NEXT: v_writelane_b32 v77, s42, 0 +; GFX11-NEXT: v_writelane_b32 v75, s42, 0 ; GFX11-NEXT: s_lshr_b32 s42, s23, 24 ; GFX11-NEXT: s_lshr_b32 s80, s12, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 31 +; GFX11-NEXT: v_writelane_b32 v76, s42, 31 ; GFX11-NEXT: s_lshr_b32 s42, s23, 16 ; GFX11-NEXT: s_lshr_b32 s82, s12, 8 ; GFX11-NEXT: s_lshr_b32 s83, s15, 24 ; GFX11-NEXT: s_lshr_b32 s84, s15, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 30 +; GFX11-NEXT: v_writelane_b32 v76, s42, 30 ; GFX11-NEXT: s_lshr_b32 s42, s23, 8 ; GFX11-NEXT: s_lshr_b32 s86, s15, 8 ; GFX11-NEXT: s_lshr_b32 s85, s14, 16 ; GFX11-NEXT: s_lshr_b32 s87, s14, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 29 +; GFX11-NEXT: v_writelane_b32 v76, s42, 29 ; GFX11-NEXT: s_lshr_b32 s42, s22, 16 ; GFX11-NEXT: s_lshr_b32 s96, s29, 24 ; GFX11-NEXT: s_lshr_b32 s97, s29, 16 ; GFX11-NEXT: s_lshr_b32 s100, s29, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 28 +; GFX11-NEXT: v_writelane_b32 v76, s42, 28 ; GFX11-NEXT: s_lshr_b32 s42, s22, 8 ; GFX11-NEXT: s_lshr_b32 s98, s28, 16 ; GFX11-NEXT: s_lshr_b32 s101, s28, 8 ; GFX11-NEXT: s_lshr_b32 s102, s41, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 27 +; GFX11-NEXT: v_writelane_b32 v76, s42, 27 ; GFX11-NEXT: s_lshr_b32 s42, s21, 24 ; GFX11-NEXT: s_lshr_b32 s103, s41, 16 ; GFX11-NEXT: s_lshr_b32 vcc_hi, s41, 8 ; GFX11-NEXT: s_lshr_b32 s104, s40, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 26 +; GFX11-NEXT: v_writelane_b32 v76, s42, 26 ; GFX11-NEXT: s_lshr_b32 s42, s21, 16 -; GFX11-NEXT: s_lshr_b64 s[62:63], s[26:27], 24 -; GFX11-NEXT: s_lshr_b64 s[72:73], s[24:25], 24 +; GFX11-NEXT: s_lshr_b64 s[72:73], s[26:27], 24 +; GFX11-NEXT: s_lshr_b64 s[62:63], s[24:25], 24 ; GFX11-NEXT: s_lshr_b64 s[60:61], s[22:23], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 25 +; GFX11-NEXT: v_writelane_b32 v76, s42, 25 ; GFX11-NEXT: s_lshr_b32 s42, s21, 8 ; GFX11-NEXT: s_lshr_b64 s[58:59], s[20:21], 24 ; GFX11-NEXT: s_lshr_b64 s[56:57], s[18:19], 24 ; GFX11-NEXT: s_lshr_b64 s[46:47], s[16:17], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 24 +; GFX11-NEXT: v_writelane_b32 v76, s42, 24 ; GFX11-NEXT: s_lshr_b32 s42, s20, 16 ; GFX11-NEXT: s_lshr_b64 s[44:45], s[2:3], 24 -; GFX11-NEXT: s_lshr_b64 s[76:77], s[6:7], 24 -; GFX11-NEXT: s_lshr_b64 s[78:79], s[8:9], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 23 +; GFX11-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 +; GFX11-NEXT: s_lshr_b64 s[94:95], s[6:7], 24 +; GFX11-NEXT: v_writelane_b32 v76, s42, 23 ; GFX11-NEXT: s_lshr_b32 s42, s20, 8 -; GFX11-NEXT: s_lshr_b64 s[88:89], s[10:11], 24 -; GFX11-NEXT: s_lshr_b64 s[90:91], s[12:13], 24 -; GFX11-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 22 +; GFX11-NEXT: s_lshr_b64 s[92:93], s[8:9], 24 +; GFX11-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 +; GFX11-NEXT: s_lshr_b64 s[88:89], s[12:13], 24 +; GFX11-NEXT: v_writelane_b32 v76, s42, 22 ; GFX11-NEXT: s_lshr_b32 s42, s19, 24 -; GFX11-NEXT: s_lshr_b64 s[94:95], s[28:29], 24 -; GFX11-NEXT: s_lshr_b64 s[30:31], s[40:41], 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 21 +; GFX11-NEXT: s_lshr_b64 s[78:79], s[14:15], 24 +; GFX11-NEXT: s_lshr_b64 s[76:77], s[28:29], 24 +; GFX11-NEXT: v_writelane_b32 v76, s42, 21 ; GFX11-NEXT: s_lshr_b32 s42, s19, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 20 +; GFX11-NEXT: v_writelane_b32 v76, s42, 20 ; GFX11-NEXT: s_lshr_b32 s42, s19, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 19 +; GFX11-NEXT: v_writelane_b32 v76, s42, 19 ; GFX11-NEXT: s_lshr_b32 s42, s18, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 18 +; GFX11-NEXT: v_writelane_b32 v76, s42, 18 ; GFX11-NEXT: s_lshr_b32 s42, s18, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 17 +; GFX11-NEXT: v_writelane_b32 v76, s42, 17 ; GFX11-NEXT: s_lshr_b32 s42, s17, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 16 +; GFX11-NEXT: v_writelane_b32 v76, s42, 16 ; GFX11-NEXT: s_lshr_b32 s42, s17, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 15 +; GFX11-NEXT: v_writelane_b32 v76, s42, 15 ; GFX11-NEXT: s_lshr_b32 s42, s17, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 14 +; GFX11-NEXT: v_writelane_b32 v76, s42, 14 ; GFX11-NEXT: s_lshr_b32 s42, s16, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 13 +; GFX11-NEXT: v_writelane_b32 v76, s42, 13 ; GFX11-NEXT: s_lshr_b32 s42, s16, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 12 +; GFX11-NEXT: v_writelane_b32 v76, s42, 12 ; GFX11-NEXT: s_lshr_b32 s42, s3, 24 -; GFX11-NEXT: v_writelane_b32 v78, s42, 11 +; GFX11-NEXT: v_writelane_b32 v76, s42, 11 ; GFX11-NEXT: s_lshr_b32 s42, s3, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 10 +; GFX11-NEXT: v_writelane_b32 v76, s42, 10 ; GFX11-NEXT: s_lshr_b32 s42, s3, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 9 +; GFX11-NEXT: v_writelane_b32 v76, s42, 9 ; GFX11-NEXT: s_lshr_b32 s42, s2, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 8 +; GFX11-NEXT: v_writelane_b32 v76, s42, 8 ; GFX11-NEXT: s_lshr_b32 s42, s2, 8 -; GFX11-NEXT: v_writelane_b32 v78, s42, 7 +; GFX11-NEXT: v_writelane_b32 v76, s42, 7 ; GFX11-NEXT: s_lshr_b32 s42, s1, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 6 +; GFX11-NEXT: v_writelane_b32 v76, s42, 6 ; GFX11-NEXT: s_lshr_b32 s42, s1, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 5 +; GFX11-NEXT: v_writelane_b32 v76, s42, 5 ; GFX11-NEXT: s_lshr_b32 s42, s1, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 4 +; GFX11-NEXT: v_writelane_b32 v76, s42, 4 ; GFX11-NEXT: s_lshr_b32 s42, s0, 16 -; GFX11-NEXT: v_writelane_b32 v78, s42, 3 +; GFX11-NEXT: v_writelane_b32 v76, s42, 3 ; GFX11-NEXT: s_lshr_b32 s42, s0, 8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_writelane_b32 v78, s42, 2 +; GFX11-NEXT: v_writelane_b32 v76, s42, 2 ; GFX11-NEXT: s_lshr_b32 s42, s40, 8 -; GFX11-NEXT: v_writelane_b32 v78, s74, 0 -; GFX11-NEXT: v_writelane_b32 v78, s75, 1 -; GFX11-NEXT: s_lshr_b64 s[74:75], s[4:5], 24 +; GFX11-NEXT: v_writelane_b32 v76, s74, 0 +; GFX11-NEXT: v_writelane_b32 v76, s75, 1 +; GFX11-NEXT: s_lshr_b64 s[74:75], s[40:41], 24 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s99 ; GFX11-NEXT: s_cbranch_vccnz .LBB99_4 ; GFX11-NEXT: .LBB99_2: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v39, s17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v38, s16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, s5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, s4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v51, s3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v50, s2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v33, s21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, s24, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v29, s23, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v28, s22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v33, s21, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v32, s20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, s5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, s4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v53, s3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v52, s2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v37, s19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v36, s18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, s41, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v15, s40, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, s29, 3 op_sel_hi:[1,0] @@ -218599,110 +218090,108 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v3, s6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v53, s1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v52, s0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v37, s19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, s25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v55, s1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v54, s0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v39, s17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v38, s16, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v21, s27, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v20, s26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v24, s24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v36, s18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[38:39] -; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[28:29] -; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[32:33] -; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[50:51] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[24:25] +; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[28:29] +; GFX11-NEXT: v_lshrrev_b64 v[68:69], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[69:70], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] ; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] -; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[20:21] -; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[24:25] -; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[36:37] -; GFX11-NEXT: v_lshrrev_b64 v[82:83], 24, v[52:53] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[20:21] +; GFX11-NEXT: v_lshrrev_b64 v[70:71], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b64 v[81:82], 24, v[54:55] ; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] ; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX11-NEXT: v_lshrrev_b64 v[26:27], 24, v[7:8] ; GFX11-NEXT: v_lshrrev_b64 v[30:31], 24, v[9:10] ; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] -; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[13:14] -; GFX11-NEXT: v_lshrrev_b64 v[68:69], 24, v[15:16] -; GFX11-NEXT: v_lshrrev_b32_e32 v147, 24, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v148, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v149, 8, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v150, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v151, 8, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v161, 24, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v160, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v163, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v164, 8, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v165, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v167, 8, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v177, 8, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v178, 16, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v33 -; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v183, 16, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v41, 8, v37 -; GFX11-NEXT: v_lshrrev_b32_e32 v42, 16, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v43, 8, v36 -; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v44, 16, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v39 -; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v38 -; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v51 -; GFX11-NEXT: v_lshrrev_b32_e32 v57, 16, v51 -; GFX11-NEXT: v_lshrrev_b32_e32 v59, 8, v51 -; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v50 -; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v62, 16, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v72, 8, v53 -; GFX11-NEXT: v_lshrrev_b32_e32 v73, 16, v52 -; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v52 +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 24, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 24, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 24, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 16, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 24, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v53 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 16, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 8, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 24, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 16, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 16, v54 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 8, v54 ; GFX11-NEXT: v_lshrrev_b32_e32 v19, 24, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v97, 8, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 24, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 8, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v102, 8, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v103, 24, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v113, 8, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v115, 8, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v116, 24, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v118, 8, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v128, 8, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v129, 24, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v131, 8, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v133, 8, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v134, 24, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v15 ; GFX11-NEXT: s_branch .LBB99_5 ; GFX11-NEXT: .LBB99_3: ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -218715,57 +218204,57 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr56 ; GFX11-NEXT: ; implicit-def: $sgpr58 ; GFX11-NEXT: ; implicit-def: $sgpr60 +; GFX11-NEXT: ; implicit-def: $sgpr62 +; GFX11-NEXT: ; implicit-def: $sgpr72 ; GFX11-NEXT: ; implicit-def: $sgpr104 +; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: ; implicit-def: $sgpr103 ; GFX11-NEXT: ; implicit-def: $sgpr102 ; GFX11-NEXT: ; implicit-def: $sgpr101 ; GFX11-NEXT: ; implicit-def: $sgpr98 +; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: ; implicit-def: $sgpr100 ; GFX11-NEXT: ; implicit-def: $sgpr97 ; GFX11-NEXT: ; implicit-def: $sgpr96 ; GFX11-NEXT: ; implicit-def: $sgpr87 ; GFX11-NEXT: ; implicit-def: $sgpr85 +; GFX11-NEXT: ; implicit-def: $sgpr78 ; GFX11-NEXT: ; implicit-def: $sgpr86 ; GFX11-NEXT: ; implicit-def: $sgpr84 ; GFX11-NEXT: ; implicit-def: $sgpr83 ; GFX11-NEXT: ; implicit-def: $sgpr82 ; GFX11-NEXT: ; implicit-def: $sgpr80 +; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr81 ; GFX11-NEXT: ; implicit-def: $sgpr71 ; GFX11-NEXT: ; implicit-def: $sgpr70 ; GFX11-NEXT: ; implicit-def: $sgpr69 ; GFX11-NEXT: ; implicit-def: $sgpr67 +; GFX11-NEXT: ; implicit-def: $sgpr90 ; GFX11-NEXT: ; implicit-def: $sgpr68 ; GFX11-NEXT: ; implicit-def: $sgpr66 ; GFX11-NEXT: ; implicit-def: $sgpr65 ; GFX11-NEXT: ; implicit-def: $sgpr64 ; GFX11-NEXT: ; implicit-def: $sgpr54 +; GFX11-NEXT: ; implicit-def: $sgpr92 ; GFX11-NEXT: ; implicit-def: $sgpr55 ; GFX11-NEXT: ; implicit-def: $sgpr53 ; GFX11-NEXT: ; implicit-def: $sgpr52 ; GFX11-NEXT: ; implicit-def: $sgpr51 ; GFX11-NEXT: ; implicit-def: $sgpr49 +; GFX11-NEXT: ; implicit-def: $sgpr94 ; GFX11-NEXT: ; implicit-def: $sgpr50 ; GFX11-NEXT: ; implicit-def: $sgpr48 ; GFX11-NEXT: ; implicit-def: $sgpr39 ; GFX11-NEXT: ; implicit-def: $sgpr38 ; GFX11-NEXT: ; implicit-def: $sgpr36 +; GFX11-NEXT: ; implicit-def: $sgpr30 ; GFX11-NEXT: ; implicit-def: $sgpr37 ; GFX11-NEXT: ; implicit-def: $sgpr35 ; GFX11-NEXT: ; implicit-def: $sgpr34 -; GFX11-NEXT: ; implicit-def: $sgpr72 -; GFX11-NEXT: ; implicit-def: $sgpr62 -; GFX11-NEXT: ; implicit-def: $sgpr30 -; GFX11-NEXT: ; implicit-def: $sgpr94 -; GFX11-NEXT: ; implicit-def: $sgpr92 -; GFX11-NEXT: ; implicit-def: $sgpr90 -; GFX11-NEXT: ; implicit-def: $sgpr88 -; GFX11-NEXT: ; implicit-def: $sgpr78 -; GFX11-NEXT: ; implicit-def: $sgpr76 -; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-NEXT: v_writelane_b32 v76, s42, 0 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -218776,7 +218265,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: v_writelane_b32 v78, s43, 1 +; GFX11-NEXT: v_writelane_b32 v76, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr43 @@ -218845,295 +218334,295 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: s_branch .LBB99_2 ; GFX11-NEXT: .LBB99_4: -; GFX11-NEXT: v_dual_mov_b32 v52, s0 :: v_dual_mov_b32 v53, s1 -; GFX11-NEXT: v_readlane_b32 s0, v78, 2 -; GFX11-NEXT: v_mov_b32_e32 v71, s50 +; GFX11-NEXT: v_dual_mov_b32 v54, s0 :: v_dual_mov_b32 v55, s1 +; GFX11-NEXT: v_readlane_b32 s0, v76, 2 +; GFX11-NEXT: v_mov_b32_e32 v65, s50 ; GFX11-NEXT: v_dual_mov_b32 v15, s40 :: v_dual_mov_b32 v16, s41 ; GFX11-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v14, s29 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v74, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 3 +; GFX11-NEXT: v_mov_b32_e32 v72, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 3 ; GFX11-NEXT: v_dual_mov_b32 v11, s14 :: v_dual_mov_b32 v12, s15 ; GFX11-NEXT: v_dual_mov_b32 v9, s12 :: v_dual_mov_b32 v10, s13 -; GFX11-NEXT: v_mov_b32_e32 v73, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 4 -; GFX11-NEXT: v_mov_b32_e32 v55, s48 +; GFX11-NEXT: v_mov_b32_e32 v63, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 4 +; GFX11-NEXT: v_mov_b32_e32 v51, s48 ; GFX11-NEXT: v_dual_mov_b32 v7, s10 :: v_dual_mov_b32 v8, s11 ; GFX11-NEXT: v_dual_mov_b32 v5, s8 :: v_dual_mov_b32 v6, s9 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v72, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 5 +; GFX11-NEXT: v_mov_b32_e32 v62, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 5 ; GFX11-NEXT: v_mov_b32_e32 v49, s39 ; GFX11-NEXT: v_dual_mov_b32 v3, s6 :: v_dual_mov_b32 v4, s7 ; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 -; GFX11-NEXT: v_mov_b32_e32 v62, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 6 -; GFX11-NEXT: v_dual_mov_b32 v50, s2 :: v_dual_mov_b32 v51, s3 +; GFX11-NEXT: v_mov_b32_e32 v60, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 6 +; GFX11-NEXT: v_dual_mov_b32 v52, s2 :: v_dual_mov_b32 v53, s3 ; GFX11-NEXT: v_dual_mov_b32 v38, s16 :: v_dual_mov_b32 v39, s17 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v63, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 7 -; GFX11-NEXT: v_dual_mov_b32 v35, s38 :: v_dual_mov_b32 v36, s18 -; GFX11-NEXT: v_dual_mov_b32 v37, s19 :: v_dual_mov_b32 v32, s20 -; GFX11-NEXT: v_dual_mov_b32 v33, s21 :: v_dual_mov_b32 v60, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 8 -; GFX11-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_mov_b32 v29, s23 -; GFX11-NEXT: v_dual_mov_b32 v24, s24 :: v_dual_mov_b32 v25, s25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v61, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 9 -; GFX11-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v21, s27 -; GFX11-NEXT: v_dual_mov_b32 v146, s42 :: v_dual_mov_b32 v145, s104 +; GFX11-NEXT: v_readlane_b32 s0, v76, 7 +; GFX11-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v37, s19 +; GFX11-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21 ; GFX11-NEXT: v_mov_b32_e32 v59, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 10 -; GFX11-NEXT: v_dual_mov_b32 v144, vcc_hi :: v_dual_mov_b32 v135, s103 -; GFX11-NEXT: v_dual_mov_b32 v134, s102 :: v_dual_mov_b32 v133, s101 +; GFX11-NEXT: v_readlane_b32 s0, v76, 8 +; GFX11-NEXT: v_dual_mov_b32 v35, s38 :: v_dual_mov_b32 v28, s22 +; GFX11-NEXT: v_dual_mov_b32 v29, s23 :: v_dual_mov_b32 v24, s24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v25, s25 :: v_dual_mov_b32 v58, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 9 +; GFX11-NEXT: v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v21, s27 +; GFX11-NEXT: v_dual_mov_b32 v144, s42 :: v_dual_mov_b32 v135, s104 ; GFX11-NEXT: v_mov_b32_e32 v57, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 11 -; GFX11-NEXT: v_dual_mov_b32 v31, s36 :: v_dual_mov_b32 v132, s98 -; GFX11-NEXT: v_dual_mov_b32 v131, s100 :: v_dual_mov_b32 v130, s97 -; GFX11-NEXT: v_dual_mov_b32 v129, s96 :: v_dual_mov_b32 v58, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 12 -; GFX11-NEXT: v_dual_mov_b32 v27, s37 :: v_dual_mov_b32 v128, s87 -; GFX11-NEXT: v_dual_mov_b32 v119, s85 :: v_dual_mov_b32 v118, s86 +; GFX11-NEXT: v_readlane_b32 s0, v76, 10 +; GFX11-NEXT: v_dual_mov_b32 v134, vcc_hi :: v_dual_mov_b32 v133, s103 +; GFX11-NEXT: v_dual_mov_b32 v132, s102 :: v_dual_mov_b32 v131, s101 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v117, s84 :: v_dual_mov_b32 v56, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 13 -; GFX11-NEXT: v_dual_mov_b32 v116, s83 :: v_dual_mov_b32 v115, s82 -; GFX11-NEXT: v_dual_mov_b32 v114, s80 :: v_dual_mov_b32 v113, s81 ; GFX11-NEXT: v_mov_b32_e32 v47, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 14 -; GFX11-NEXT: v_dual_mov_b32 v23, s35 :: v_dual_mov_b32 v112, s71 -; GFX11-NEXT: v_dual_mov_b32 v103, s70 :: v_dual_mov_b32 v102, s69 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v101, s67 :: v_dual_mov_b32 v46, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 15 -; GFX11-NEXT: v_dual_mov_b32 v19, s34 :: v_dual_mov_b32 v100, s68 -; GFX11-NEXT: v_dual_mov_b32 v99, s66 :: v_dual_mov_b32 v98, s65 -; GFX11-NEXT: v_dual_mov_b32 v97, s64 :: v_dual_mov_b32 v44, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 16 -; GFX11-NEXT: v_dual_mov_b32 v96, s54 :: v_dual_mov_b32 v87, s55 -; GFX11-NEXT: v_dual_mov_b32 v86, s53 :: v_dual_mov_b32 v85, s52 +; GFX11-NEXT: v_readlane_b32 s0, v76, 11 +; GFX11-NEXT: v_dual_mov_b32 v31, s36 :: v_dual_mov_b32 v130, s98 +; GFX11-NEXT: v_dual_mov_b32 v129, s100 :: v_dual_mov_b32 v128, s97 +; GFX11-NEXT: v_dual_mov_b32 v119, s96 :: v_dual_mov_b32 v56, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 12 +; GFX11-NEXT: v_dual_mov_b32 v27, s37 :: v_dual_mov_b32 v118, s87 +; GFX11-NEXT: v_dual_mov_b32 v117, s85 :: v_dual_mov_b32 v116, s86 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v115, s84 :: v_dual_mov_b32 v46, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 13 +; GFX11-NEXT: v_dual_mov_b32 v114, s83 :: v_dual_mov_b32 v113, s82 +; GFX11-NEXT: v_dual_mov_b32 v112, s80 :: v_dual_mov_b32 v103, s81 ; GFX11-NEXT: v_mov_b32_e32 v45, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 17 -; GFX11-NEXT: v_dual_mov_b32 v84, s51 :: v_dual_mov_b32 v83, s49 -; GFX11-NEXT: v_dual_mov_b32 v147, s43 :: v_dual_mov_b32 v22, s78 +; GFX11-NEXT: v_readlane_b32 s0, v76, 14 +; GFX11-NEXT: v_dual_mov_b32 v23, s35 :: v_dual_mov_b32 v102, s71 +; GFX11-NEXT: v_dual_mov_b32 v101, s70 :: v_dual_mov_b32 v100, s69 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v99, s67 :: v_dual_mov_b32 v44, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 15 +; GFX11-NEXT: v_dual_mov_b32 v19, s34 :: v_dual_mov_b32 v98, s68 +; GFX11-NEXT: v_dual_mov_b32 v97, s66 :: v_dual_mov_b32 v96, s65 +; GFX11-NEXT: v_dual_mov_b32 v87, s64 :: v_dual_mov_b32 v42, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 16 +; GFX11-NEXT: v_dual_mov_b32 v86, s54 :: v_dual_mov_b32 v85, s55 +; GFX11-NEXT: v_dual_mov_b32 v84, s53 :: v_dual_mov_b32 v83, s52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v43, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 18 -; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 -; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v42, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 19 -; GFX11-NEXT: v_readlane_b32 s1, v78, 1 -; GFX11-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 -; GFX11-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 +; GFX11-NEXT: v_readlane_b32 s0, v76, 17 +; GFX11-NEXT: v_dual_mov_b32 v82, s51 :: v_dual_mov_b32 v71, s49 +; GFX11-NEXT: v_dual_mov_b32 v145, s43 :: v_dual_mov_b32 v26, s90 ; GFX11-NEXT: v_mov_b32_e32 v41, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 20 -; GFX11-NEXT: v_mov_b32_e32 v48, s62 -; GFX11-NEXT: v_mov_b32_e32 v54, s72 -; GFX11-NEXT: v_mov_b32_e32 v64, s60 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v70, s56 :: v_dual_mov_b32 v183, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 21 -; GFX11-NEXT: v_mov_b32_e32 v80, s46 -; GFX11-NEXT: v_mov_b32_e32 v18, s76 +; GFX11-NEXT: v_readlane_b32 s0, v76, 18 +; GFX11-NEXT: v_dual_mov_b32 v67, s60 :: v_dual_mov_b32 v30, s88 +; GFX11-NEXT: v_dual_mov_b32 v69, s56 :: v_dual_mov_b32 v34, s78 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v40, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 22 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v76, 19 +; GFX11-NEXT: v_readlane_b32 s1, v76, 1 +; GFX11-NEXT: v_dual_mov_b32 v17, s30 :: v_dual_mov_b32 v64, s74 +; GFX11-NEXT: v_dual_mov_b32 v50, s72 :: v_dual_mov_b32 v183, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 20 +; GFX11-NEXT: v_mov_b32_e32 v66, s62 +; GFX11-NEXT: v_mov_b32_e32 v68, s58 +; GFX11-NEXT: v_mov_b32_e32 v70, s46 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_mov_b32 v80, s44 :: v_dual_mov_b32 v181, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 21 +; GFX11-NEXT: v_mov_b32_e32 v18, s94 +; GFX11-NEXT: v_mov_b32_e32 v22, s92 +; GFX11-NEXT: v_mov_b32_e32 v48, s76 ; GFX11-NEXT: v_mov_b32_e32 v182, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 23 -; GFX11-NEXT: v_mov_b32_e32 v181, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 24 +; GFX11-NEXT: v_readlane_b32 s0, v76, 22 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v180, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 25 -; GFX11-NEXT: v_mov_b32_e32 v178, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v76, 23 ; GFX11-NEXT: v_mov_b32_e32 v179, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 27 -; GFX11-NEXT: v_mov_b32_e32 v177, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 28 +; GFX11-NEXT: v_readlane_b32 s0, v76, 24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v178, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 25 ; GFX11-NEXT: v_mov_b32_e32 v176, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 29 +; GFX11-NEXT: v_readlane_b32 s0, v76, 26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v177, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 27 ; GFX11-NEXT: v_mov_b32_e32 v167, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 30 +; GFX11-NEXT: v_readlane_b32 s0, v76, 28 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v165, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 31 ; GFX11-NEXT: v_mov_b32_e32 v166, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 29 +; GFX11-NEXT: v_mov_b32_e32 v165, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 30 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v164, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 1 ; GFX11-NEXT: v_mov_b32_e32 v163, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 2 +; GFX11-NEXT: v_readlane_b32 s0, v76, 31 +; GFX11-NEXT: v_mov_b32_e32 v164, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v162, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 3 -; GFX11-NEXT: v_mov_b32_e32 v160, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s0, v75, 1 ; GFX11-NEXT: v_mov_b32_e32 v161, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 5 -; GFX11-NEXT: v_mov_b32_e32 v151, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 6 +; GFX11-NEXT: v_readlane_b32 s0, v75, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v160, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 3 ; GFX11-NEXT: v_mov_b32_e32 v150, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 7 +; GFX11-NEXT: v_readlane_b32 s0, v75, 4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v151, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 5 ; GFX11-NEXT: v_mov_b32_e32 v149, s0 -; GFX11-NEXT: v_readlane_b32 s0, v77, 8 +; GFX11-NEXT: v_readlane_b32 s0, v75, 6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v148, s0 -; GFX11-NEXT: v_readlane_b32 s0, v78, 0 -; GFX11-NEXT: v_mov_b32_e32 v82, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 7 +; GFX11-NEXT: v_mov_b32_e32 v147, s0 +; GFX11-NEXT: v_readlane_b32 s0, v75, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v146, s0 +; GFX11-NEXT: v_readlane_b32 s0, v76, 0 +; GFX11-NEXT: v_mov_b32_e32 v81, s0 ; GFX11-NEXT: .LBB99_5: ; %end -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74 -; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v52 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v82 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v63 +; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v72, 8, v72 +; GFX11-NEXT: v_and_b32_e32 v63, 0xff, v63 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v81 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v69 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v73 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v50 -; GFX11-NEXT: v_and_b32_e32 v57, 0xff, v57 -; GFX11-NEXT: v_lshlrev_b32_e32 v58, 8, v58 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v52 -; GFX11-NEXT: v_or_b32_e32 v66, v69, v66 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v72 -; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v62, 8, v62 +; GFX11-NEXT: v_or_b32_e32 v54, v54, v72 +; GFX11-NEXT: v_and_b32_e32 v60, 0xff, v60 +; GFX11-NEXT: v_or_b32_e32 v81, v63, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v61, 8, v61 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v62 +; GFX11-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v59, 8, v59 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v81 +; GFX11-NEXT: v_or_b32_e32 v60, v60, v61 +; GFX11-NEXT: v_and_b32_e32 v61, 0xff, v52 +; GFX11-NEXT: v_and_b32_e32 v58, 0xff, v58 ; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v80 -; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v66 -; GFX11-NEXT: v_or_b32_e32 v53, v53, v69 -; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v62 +; GFX11-NEXT: v_or_b32_e32 v52, v54, v81 +; GFX11-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v60 +; GFX11-NEXT: v_or_b32_e32 v81, v61, v59 +; GFX11-NEXT: v_or_b32_e32 v80, v58, v80 +; GFX11-NEXT: v_and_b32_e32 v58, 0xff, v53 +; GFX11-NEXT: v_lshlrev_b32_e32 v57, 8, v57 +; GFX11-NEXT: v_and_b32_e32 v47, 0xff, v47 +; GFX11-NEXT: v_lshlrev_b32_e32 v56, 8, v56 +; GFX11-NEXT: v_or_b32_e32 v53, v54, v55 +; GFX11-NEXT: v_and_b32_e32 v54, 0xffff, v81 +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v80 +; GFX11-NEXT: v_or_b32_e32 v80, v58, v57 +; GFX11-NEXT: v_or_b32_e32 v81, v47, v56 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v46, 8, v46 +; GFX11-NEXT: v_and_b32_e32 v45, 0xff, v45 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v70 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v67 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 -; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 -; GFX11-NEXT: v_or_b32_e32 v69, v69, v82 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v60 -; GFX11-NEXT: v_and_b32_e32 v60, 0xff, v61 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v64 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 -; GFX11-NEXT: v_or_b32_e32 v82, v50, v82 -; GFX11-NEXT: v_or_b32_e32 v81, v60, v81 -; GFX11-NEXT: v_or_b32_e32 v50, v52, v66 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v69 -; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v51 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v59 -; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 -; GFX11-NEXT: v_or_b32_e32 v51, v52, v53 -; GFX11-NEXT: v_and_b32_e32 v52, 0xffff, v82 -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v81 -; GFX11-NEXT: v_or_b32_e32 v66, v66, v69 -; GFX11-NEXT: v_or_b32_e32 v69, v57, v58 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v56 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v47 -; GFX11-NEXT: v_or_b32_e32 v52, v52, v53 -; GFX11-NEXT: v_and_b32_e32 v53, 0xffff, v66 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v38, v38, v81 -; GFX11-NEXT: v_or_b32_e32 v69, v82, v80 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v46 -; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v44 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v45 -; GFX11-NEXT: v_or_b32_e32 v53, v53, v66 +; GFX11-NEXT: v_or_b32_e32 v54, v54, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xffff, v80 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v81 +; GFX11-NEXT: v_or_b32_e32 v38, v38, v46 +; GFX11-NEXT: v_or_b32_e32 v70, v45, v70 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v44 +; GFX11-NEXT: v_and_b32_e32 v42, 0xff, v42 +; GFX11-NEXT: v_lshlrev_b32_e32 v43, 8, v43 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v80 ; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v39, v39, v80 -; GFX11-NEXT: v_or_b32_e32 v69, v81, v82 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v43 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v42 -; GFX11-NEXT: v_or_b32_e32 v36, v38, v66 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v70 +; GFX11-NEXT: v_or_b32_e32 v39, v39, v81 +; GFX11-NEXT: v_or_b32_e32 v80, v42, v43 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; GFX11-NEXT: v_and_b32_e32 v40, 0xff, v40 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v69 +; GFX11-NEXT: v_or_b32_e32 v36, v38, v70 ; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v66, v80, v81 -; GFX11-NEXT: v_or_b32_e32 v69, v82, v70 -; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v41 -; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v183 -; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v40 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v80 +; GFX11-NEXT: v_or_b32_e32 v70, v81, v41 +; GFX11-NEXT: v_or_b32_e32 v69, v40, v69 +; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v183 +; GFX11-NEXT: v_and_b32_e32 v181, 0xff, v181 +; GFX11-NEXT: v_lshlrev_b32_e32 v182, 8, v182 ; GFX11-NEXT: v_or_b32_e32 v37, v38, v39 -; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v66 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff, v70 ; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v66, v70, v80 -; GFX11-NEXT: v_or_b32_e32 v69, v81, v82 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v181 -; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v182 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v180 -; GFX11-NEXT: v_and_b32_e32 v66, 0xffff, v66 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 -; GFX11-NEXT: v_or_b32_e32 v67, v80, v67 -; GFX11-NEXT: v_or_b32_e32 v32, v32, v70 -; GFX11-NEXT: v_or_b32_e32 v33, v33, v81 +; GFX11-NEXT: v_or_b32_e32 v69, v80, v81 +; GFX11-NEXT: v_or_b32_e32 v70, v181, v182 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v180 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v179 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v68 +; GFX11-NEXT: v_and_b32_e32 v69, 0xffff, v69 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v70 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v178, 8, v178 +; GFX11-NEXT: v_or_b32_e32 v32, v32, v80 +; GFX11-NEXT: v_or_b32_e32 v68, v81, v68 ; GFX11-NEXT: v_or_b32_e32 v38, v38, v39 -; GFX11-NEXT: v_or_b32_e32 v39, v66, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v67 -; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v178 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v179 +; GFX11-NEXT: v_or_b32_e32 v39, v69, v70 +; GFX11-NEXT: v_and_b32_e32 v69, 0xff, v176 ; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v177 -; GFX11-NEXT: v_and_b32_e32 v80, 0xff, v176 -; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v167 -; GFX11-NEXT: v_and_b32_e32 v82, 0xff, v165 -; GFX11-NEXT: v_lshlrev_b32_e32 v165, 8, v166 -; GFX11-NEXT: v_or_b32_e32 v67, v67, v69 -; GFX11-NEXT: v_or_b32_e32 v28, v28, v70 -; GFX11-NEXT: v_or_b32_e32 v64, v80, v64 -; GFX11-NEXT: v_or_b32_e32 v29, v29, v81 -; GFX11-NEXT: v_or_b32_e32 v69, v82, v165 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v167 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v166 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v67 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v165, 8, v165 +; GFX11-NEXT: v_and_b32_e32 v163, 0xff, v163 +; GFX11-NEXT: v_lshlrev_b32_e32 v164, 8, v164 +; GFX11-NEXT: v_or_b32_e32 v33, v33, v178 +; GFX11-NEXT: v_or_b32_e32 v69, v69, v70 +; GFX11-NEXT: v_or_b32_e32 v28, v28, v80 +; GFX11-NEXT: v_or_b32_e32 v67, v81, v67 +; GFX11-NEXT: v_or_b32_e32 v29, v29, v165 +; GFX11-NEXT: v_or_b32_e32 v70, v163, v164 ; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v68 ; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 ; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v64 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67 ; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v69 +; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v70 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[50:53], off +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off ; GFX11-NEXT: scratch_store_b128 v0, v[36:39], off offset:16 -; GFX11-NEXT: v_or_b32_e32 v36, v32, v66 -; GFX11-NEXT: v_or_b32_e32 v37, v33, v67 -; GFX11-NEXT: v_or_b32_e32 v38, v28, v64 -; GFX11-NEXT: v_or_b32_e32 v39, v29, v69 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v164 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v163 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v54 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v162 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v160 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v161 +; GFX11-NEXT: v_or_b32_e32 v36, v32, v68 +; GFX11-NEXT: v_or_b32_e32 v37, v33, v69 +; GFX11-NEXT: v_or_b32_e32 v38, v28, v67 +; GFX11-NEXT: v_or_b32_e32 v39, v29, v70 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v162 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v161 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v160 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v150 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v151 ; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v151 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v149 ; GFX11-NEXT: v_or_b32_e32 v24, v24, v28 ; GFX11-NEXT: v_or_b32_e32 v28, v29, v32 ; GFX11-NEXT: v_or_b32_e32 v25, v25, v33 -; GFX11-NEXT: v_or_b32_e32 v29, v50, v51 -; GFX11-NEXT: v_or_b32_e32 v20, v20, v52 -; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v150 -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v48 +; GFX11-NEXT: v_or_b32_e32 v29, v52, v53 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v54 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v148 +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 8, v50 ; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v149 -; GFX11-NEXT: v_and_b32_e32 v50, 0xff, v148 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v147 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v147 +; GFX11-NEXT: v_and_b32_e32 v52, 0xff, v146 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v145 ; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v146 -; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v145 -; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v68 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v144 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v135 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v64 ; GFX11-NEXT: v_or_b32_e32 v32, v32, v33 -; GFX11-NEXT: v_or_b32_e32 v21, v21, v48 -; GFX11-NEXT: v_or_b32_e32 v33, v50, v51 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v52 -; GFX11-NEXT: v_or_b32_e32 v48, v53, v54 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v50 +; GFX11-NEXT: v_or_b32_e32 v33, v52, v53 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v54 +; GFX11-NEXT: v_or_b32_e32 v50, v55, v64 ; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 @@ -219141,39 +218630,39 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50 ; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-NEXT: v_or_b32_e32 v50, v24, v28 -; GFX11-NEXT: v_or_b32_e32 v52, v20, v32 -; GFX11-NEXT: v_or_b32_e32 v53, v21, v33 -; GFX11-NEXT: v_or_b32_e32 v64, v15, v48 +; GFX11-NEXT: v_or_b32_e32 v52, v24, v28 +; GFX11-NEXT: v_or_b32_e32 v54, v20, v32 +; GFX11-NEXT: v_or_b32_e32 v55, v21, v33 +; GFX11-NEXT: v_or_b32_e32 v66, v15, v50 ; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v144 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v135 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v134 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v134 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v133 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v132 ; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v133 -; GFX11-NEXT: v_or_b32_e32 v51, v25, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v132 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v65 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v131 +; GFX11-NEXT: v_or_b32_e32 v53, v25, v29 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v130 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 8, v48 ; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v131 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v129 ; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-NEXT: v_or_b32_e32 v16, v20, v21 ; GFX11-NEXT: v_or_b32_e32 v13, v13, v24 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v130 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v129 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v128 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v119 ; GFX11-NEXT: v_or_b32_e32 v20, v25, v28 ; GFX11-NEXT: v_or_b32_e32 v14, v14, v29 ; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v128 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v119 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v118 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v117 ; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v34 ; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v118 -; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v117 -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v116 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 8, v116 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v115 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 8, v114 ; GFX11-NEXT: v_or_b32_e32 v21, v21, v24 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -219189,36 +218678,36 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-NEXT: v_or_b32_e32 v65, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v66, v13, v20 -; GFX11-NEXT: v_or_b32_e32 v67, v14, v21 +; GFX11-NEXT: v_or_b32_e32 v67, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v68, v13, v20 +; GFX11-NEXT: v_or_b32_e32 v69, v14, v21 ; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v115 -; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v113 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v112 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v30 ; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v113 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v112 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v103 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v103 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v102 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v101 ; GFX11-NEXT: v_or_b32_e32 v11, v11, v24 ; GFX11-NEXT: v_or_b32_e32 v12, v12, v25 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v102 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v100 ; GFX11-NEXT: v_or_b32_e32 v9, v9, v13 ; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v16 ; GFX11-NEXT: v_or_b32_e32 v14, v20, v21 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v101 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v99 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v26 ; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v100 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 8, v98 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v97 -; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v87 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v86 ; GFX11-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; GFX11-NEXT: v_or_b32_e32 v7, v7, v24 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v99 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v98 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v97 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 8, v96 ; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 ; GFX11-NEXT: v_or_b32_e32 v8, v8, v20 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v25 @@ -219239,16 +218728,16 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_or_b32_e32 v7, v7, v15 ; GFX11-NEXT: v_or_b32_e32 v9, v5, v20 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v87 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v86 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v85 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v83 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v84 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v83 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v71 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; GFX11-NEXT: v_or_b32_e32 v8, v8, v16 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v84 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v82 ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v71 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v65 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-NEXT: v_or_b32_e32 v6, v10, v15 ; GFX11-NEXT: v_or_b32_e32 v10, v20, v18 @@ -219257,7 +218746,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_lshlrev_b32_e32 v16, 8, v49 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v55 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v51 ; GFX11-NEXT: v_lshlrev_b32_e32 v18, 8, v35 ; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v31 ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17 @@ -219286,78 +218775,76 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_or_b32_e32 v4, v20, v17 ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: scratch_store_b128 v0, v[36:39], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[50:53], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[66:69], off offset:64 ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v74, off, s32 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:72 -; GFX11-NEXT: v_readlane_b32 s104, v76, 8 -; GFX11-NEXT: v_readlane_b32 s103, v76, 7 -; GFX11-NEXT: v_readlane_b32 s102, v76, 6 -; GFX11-NEXT: v_readlane_b32 s101, v76, 5 -; GFX11-NEXT: v_readlane_b32 s100, v76, 4 -; GFX11-NEXT: v_readlane_b32 s99, v76, 3 -; GFX11-NEXT: v_readlane_b32 s98, v76, 2 -; GFX11-NEXT: v_readlane_b32 s97, v76, 1 -; GFX11-NEXT: v_readlane_b32 s96, v76, 0 -; GFX11-NEXT: v_readlane_b32 s87, v75, 31 -; GFX11-NEXT: v_readlane_b32 s86, v75, 30 -; GFX11-NEXT: v_readlane_b32 s85, v75, 29 -; GFX11-NEXT: v_readlane_b32 s84, v75, 28 -; GFX11-NEXT: v_readlane_b32 s83, v75, 27 -; GFX11-NEXT: v_readlane_b32 s82, v75, 26 -; GFX11-NEXT: v_readlane_b32 s81, v75, 25 -; GFX11-NEXT: v_readlane_b32 s80, v75, 24 -; GFX11-NEXT: v_readlane_b32 s71, v75, 23 -; GFX11-NEXT: v_readlane_b32 s70, v75, 22 -; GFX11-NEXT: v_readlane_b32 s69, v75, 21 -; GFX11-NEXT: v_readlane_b32 s68, v75, 20 -; GFX11-NEXT: v_readlane_b32 s67, v75, 19 -; GFX11-NEXT: v_readlane_b32 s66, v75, 18 -; GFX11-NEXT: v_readlane_b32 s65, v75, 17 -; GFX11-NEXT: v_readlane_b32 s64, v75, 16 -; GFX11-NEXT: v_readlane_b32 s55, v75, 15 -; GFX11-NEXT: v_readlane_b32 s54, v75, 14 -; GFX11-NEXT: v_readlane_b32 s53, v75, 13 -; GFX11-NEXT: v_readlane_b32 s52, v75, 12 -; GFX11-NEXT: v_readlane_b32 s51, v75, 11 -; GFX11-NEXT: v_readlane_b32 s50, v75, 10 -; GFX11-NEXT: v_readlane_b32 s49, v75, 9 -; GFX11-NEXT: v_readlane_b32 s48, v75, 8 -; GFX11-NEXT: v_readlane_b32 s39, v75, 7 -; GFX11-NEXT: v_readlane_b32 s38, v75, 6 -; GFX11-NEXT: v_readlane_b32 s37, v75, 5 -; GFX11-NEXT: v_readlane_b32 s36, v75, 4 -; GFX11-NEXT: v_readlane_b32 s35, v75, 3 -; GFX11-NEXT: v_readlane_b32 s34, v75, 2 -; GFX11-NEXT: v_readlane_b32 s31, v75, 1 -; GFX11-NEXT: v_readlane_b32 s30, v75, 0 +; GFX11-NEXT: s_clause 0x10 ; 68-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v72, off, s32 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64 +; GFX11-NEXT: v_readlane_b32 s104, v74, 8 +; GFX11-NEXT: v_readlane_b32 s103, v74, 7 +; GFX11-NEXT: v_readlane_b32 s102, v74, 6 +; GFX11-NEXT: v_readlane_b32 s101, v74, 5 +; GFX11-NEXT: v_readlane_b32 s100, v74, 4 +; GFX11-NEXT: v_readlane_b32 s99, v74, 3 +; GFX11-NEXT: v_readlane_b32 s98, v74, 2 +; GFX11-NEXT: v_readlane_b32 s97, v74, 1 +; GFX11-NEXT: v_readlane_b32 s96, v74, 0 +; GFX11-NEXT: v_readlane_b32 s87, v73, 31 +; GFX11-NEXT: v_readlane_b32 s86, v73, 30 +; GFX11-NEXT: v_readlane_b32 s85, v73, 29 +; GFX11-NEXT: v_readlane_b32 s84, v73, 28 +; GFX11-NEXT: v_readlane_b32 s83, v73, 27 +; GFX11-NEXT: v_readlane_b32 s82, v73, 26 +; GFX11-NEXT: v_readlane_b32 s81, v73, 25 +; GFX11-NEXT: v_readlane_b32 s80, v73, 24 +; GFX11-NEXT: v_readlane_b32 s71, v73, 23 +; GFX11-NEXT: v_readlane_b32 s70, v73, 22 +; GFX11-NEXT: v_readlane_b32 s69, v73, 21 +; GFX11-NEXT: v_readlane_b32 s68, v73, 20 +; GFX11-NEXT: v_readlane_b32 s67, v73, 19 +; GFX11-NEXT: v_readlane_b32 s66, v73, 18 +; GFX11-NEXT: v_readlane_b32 s65, v73, 17 +; GFX11-NEXT: v_readlane_b32 s64, v73, 16 +; GFX11-NEXT: v_readlane_b32 s55, v73, 15 +; GFX11-NEXT: v_readlane_b32 s54, v73, 14 +; GFX11-NEXT: v_readlane_b32 s53, v73, 13 +; GFX11-NEXT: v_readlane_b32 s52, v73, 12 +; GFX11-NEXT: v_readlane_b32 s51, v73, 11 +; GFX11-NEXT: v_readlane_b32 s50, v73, 10 +; GFX11-NEXT: v_readlane_b32 s49, v73, 9 +; GFX11-NEXT: v_readlane_b32 s48, v73, 8 +; GFX11-NEXT: v_readlane_b32 s39, v73, 7 +; GFX11-NEXT: v_readlane_b32 s38, v73, 6 +; GFX11-NEXT: v_readlane_b32 s37, v73, 5 +; GFX11-NEXT: v_readlane_b32 s36, v73, 4 +; GFX11-NEXT: v_readlane_b32 s35, v73, 3 +; GFX11-NEXT: v_readlane_b32 s34, v73, 2 +; GFX11-NEXT: v_readlane_b32 s31, v73, 1 +; GFX11-NEXT: v_readlane_b32 s30, v73, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:72 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -224276,650 +223763,644 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 -; VI-NEXT: v_mov_b32_e32 v48, v15 -; VI-NEXT: v_mov_b32_e32 v49, v13 -; VI-NEXT: v_mov_b32_e32 v50, v11 -; VI-NEXT: v_mov_b32_e32 v51, v9 -; VI-NEXT: v_mov_b32_e32 v52, v7 -; VI-NEXT: v_mov_b32_e32 v53, v5 -; VI-NEXT: v_mov_b32_e32 v54, v3 -; VI-NEXT: v_mov_b32_e32 v55, v1 +; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 ; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 -; VI-NEXT: v_mov_b32_e32 v40, v4 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 ; VI-NEXT: v_mov_b32_e32 v16, v2 -; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v39, s17 -; VI-NEXT: v_mov_b32_e32 v38, s19 -; VI-NEXT: v_mov_b32_e32 v37, s21 -; VI-NEXT: v_mov_b32_e32 v36, s23 -; VI-NEXT: v_mov_b32_e32 v35, s25 -; VI-NEXT: v_mov_b32_e32 v34, s27 -; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v43, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB101_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB101_3 ; VI-NEXT: .LBB101_2: ; %cmp.true -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v15, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v15, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v15, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v40, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v52, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v22, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v42, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v24, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v26, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v44, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v28, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v48, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v46, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v18, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v14 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v16, v33, vcc +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v17, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v16, v16, v33, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v17, v33, vcc +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v17, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v33, v17, v33, vcc +; VI-NEXT: v_bfe_u32 v17, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v17, v32, vcc +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v32, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v19, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v34, v32, v34, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v14 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v35, v19, v32, vcc +; VI-NEXT: v_bfe_u32 v19, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v14 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v20, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v19, v32, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v14 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v50, v20, v32, vcc +; VI-NEXT: v_bfe_u32 v20, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v14 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v21, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v20, v32, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v14 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v51, v21, v32, vcc +; VI-NEXT: v_bfe_u32 v21, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v14 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v22 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v22, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v21, v32, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v14 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v52, v22, v32, vcc +; VI-NEXT: v_bfe_u32 v22, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v14 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v23, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v22, v32, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v14 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v53, v23, v32, vcc +; VI-NEXT: v_bfe_u32 v23, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v14 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v24 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v24, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v23, v32, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v14 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v54, v24, v32, vcc +; VI-NEXT: v_bfe_u32 v24, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v14 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v25, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v24, v32, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v14 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v55, v25, v32, vcc +; VI-NEXT: v_bfe_u32 v25, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v14 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v26 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v26, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v25, v25, v32, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v14 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v40, v26, v32, vcc +; VI-NEXT: v_bfe_u32 v26, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v14 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v27, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v26, v26, v32, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v14 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v41, v27, v32, vcc +; VI-NEXT: v_bfe_u32 v27, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v14 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v28 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v28, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v27, v27, v32, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v14 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v42, v28, v32, vcc +; VI-NEXT: v_bfe_u32 v28, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v14 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v29, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v28, v28, v32, vcc +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v14 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v49, v29, v32, vcc +; VI-NEXT: v_bfe_u32 v29, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v14 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v30 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v30, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v14 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v17, v30, v32, vcc +; VI-NEXT: v_bfe_u32 v30, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v14 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v30, v30, v32, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v45, v31, v32, vcc +; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v30, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v16, v3, v5, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v56, v3, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v58, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v7, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v7, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v1 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; VI-NEXT: v_bfe_u32 v6, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v60, v7, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_bfe_u32 v14, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v32, v14, v32, vcc +; VI-NEXT: v_bfe_u32 v14, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v9, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v1 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v62, v9, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cndmask_b32_e32 v46, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; VI-NEXT: v_bfe_u32 v11, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v11, v13, vcc -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v2 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v47, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v2 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v56, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v4 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v57, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v4 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v4, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v5 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v58, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v5 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v5, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v6 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v59, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v6 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v60, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v8 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v61, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v8 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v8, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v9 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v62, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v9 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_bfe_u32 v13, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; VI-NEXT: v_cndmask_b32_e32 v9, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v10 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 -; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v63, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v10 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v11, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v34 -; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_bfe_u32 v19, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v15 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v10, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v11 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v39, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v11 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_bfe_u32 v15, v12, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v32, v19, v32, vcc -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v12 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; VI-NEXT: v_cndmask_b32_e32 v11, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v12 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v15, v15, v19, vcc -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v48, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v12 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_bfe_u32 v19, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v15 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc -; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v19, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v12, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v38, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v36, vcc +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_bfe_u32 v34, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v14 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v14 +; VI-NEXT: v_bfe_u32 v36, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v14 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v15 -; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v43, v36, v37, vcc +; VI-NEXT: v_bfe_u32 v36, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v14 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v15, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; VI-NEXT: v_and_b32_e32 v34, 0xffff0000, v55 -; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v55 -; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v14, v36, v37, vcc +; VI-NEXT: v_bfe_u32 v36, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v15 ; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v34 -; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[56:57] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_mov_b32_e32 v39, v34 -; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[58:59] -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_mov_b32_e32 v38, v34 -; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[60:61] -; VI-NEXT: v_cndmask_b32_e32 v54, v36, v37, vcc -; VI-NEXT: v_mov_b32_e32 v37, v34 -; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[62:63] -; VI-NEXT: v_mov_b32_e32 v35, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[32:33] -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_mov_b32_e32 v34, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[54:55] -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[14:15] -; VI-NEXT: v_mov_b32_e32 v55, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[40:41] -; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] -; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13] -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[28:29] -; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[26:27] -; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[24:25] -; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] -; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33] -; VI-NEXT: v_mov_b32_e32 v54, v32 -; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[52:53] -; VI-NEXT: v_mov_b32_e32 v53, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[42:43] -; VI-NEXT: v_mov_b32_e32 v52, v32 -; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[50:51] -; VI-NEXT: v_mov_b32_e32 v51, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[44:45] -; VI-NEXT: v_mov_b32_e32 v50, v32 -; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[48:49] -; VI-NEXT: v_mov_b32_e32 v49, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[16:17] -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v48, v32 -; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[30:31] -; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[46:47] -; VI-NEXT: v_mov_b32_e32 v31, v32 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[15:16] -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v36, v37, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[14:15] +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 +; VI-NEXT: v_lshrrev_b64 v[43:44], 16, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v38 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[12:13] +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v48 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[10:11] +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v63 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v62 +; VI-NEXT: v_lshrrev_b64 v[62:63], 16, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v61 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v60 +; VI-NEXT: v_lshrrev_b64 v[60:61], 16, v[6:7] +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v59 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v58 +; VI-NEXT: v_lshrrev_b64 v[58:59], 16, v[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v57 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; VI-NEXT: v_lshrrev_b64 v[56:57], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v47 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshrrev_b64 v[46:47], 16, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[31:32] +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v45 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[30:31] +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[29:30] +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v49 +; VI-NEXT: v_lshrrev_b64 v[31:32], 16, v[28:29] +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v42 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[27:28] +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[26:27] +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v40 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[25:26] +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v55 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[24:25] +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v54 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[23:24] +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[22:23] +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v52 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v51 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[20:21] +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v50 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[19:20] +; VI-NEXT: v_mov_b32_e32 v3, v56 +; VI-NEXT: v_mov_b32_e32 v5, v58 +; VI-NEXT: v_mov_b32_e32 v7, v60 +; VI-NEXT: v_mov_b32_e32 v9, v62 +; VI-NEXT: v_mov_b32_e32 v11, v38 +; VI-NEXT: v_mov_b32_e32 v13, v37 +; VI-NEXT: v_mov_b32_e32 v15, v36 +; VI-NEXT: v_mov_b32_e32 v21, v51 +; VI-NEXT: v_mov_b32_e32 v23, v53 +; VI-NEXT: v_mov_b32_e32 v25, v39 +; VI-NEXT: v_mov_b32_e32 v27, v41 +; VI-NEXT: v_mov_b32_e32 v29, v31 +; VI-NEXT: v_mov_b32_e32 v31, v14 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[33:34] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[33:34], 16, v[16:17] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[18:19] +; VI-NEXT: v_mov_b32_e32 v1, v46 +; VI-NEXT: v_mov_b32_e32 v17, v33 +; VI-NEXT: v_mov_b32_e32 v19, v49 ; VI-NEXT: .LBB101_3: ; %end -; VI-NEXT: v_mov_b32_e32 v13, v18 -; VI-NEXT: v_mov_b32_e32 v18, v40 +; VI-NEXT: v_mov_b32_e32 v14, v43 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -224936,20 +224417,7 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v1, v39 -; VI-NEXT: v_mov_b32_e32 v3, v38 -; VI-NEXT: v_mov_b32_e32 v5, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 -; VI-NEXT: v_mov_b32_e32 v9, v35 -; VI-NEXT: v_mov_b32_e32 v11, v34 -; VI-NEXT: v_mov_b32_e32 v15, v55 -; VI-NEXT: v_mov_b32_e32 v17, v54 -; VI-NEXT: v_mov_b32_e32 v19, v53 -; VI-NEXT: v_mov_b32_e32 v21, v52 -; VI-NEXT: v_mov_b32_e32 v23, v51 -; VI-NEXT: v_mov_b32_e32 v25, v50 -; VI-NEXT: v_mov_b32_e32 v27, v49 -; VI-NEXT: v_mov_b32_e32 v29, v48 +; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB101_4: @@ -233532,710 +233000,611 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 -; SI-NEXT: v_mov_b32_e32 v50, v27 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v30 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v44 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v46 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v56 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v58 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v60 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v45 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v57 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v59 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v60 -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v31, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v62 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v63 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v41, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s29 +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB105_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_mov_b32_e32 v56, v36 +; SI-NEXT: v_mov_b32_e32 v58, v5 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v60, v40 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; SI-NEXT: v_mov_b32_e32 v63, v47 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 +; SI-NEXT: v_mov_b32_e32 v44, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v42 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v60 -; SI-NEXT: v_mov_b32_e32 v28, v3 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; SI-NEXT: v_mov_b32_e32 v3, v9 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v52 -; SI-NEXT: v_mov_b32_e32 v50, v48 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; SI-NEXT: v_mov_b32_e32 v38, v13 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v23 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: v_mov_b32_e32 v52, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v49 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v7, v40 -; SI-NEXT: v_mov_b32_e32 v48, v27 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v53 -; SI-NEXT: v_mov_b32_e32 v53, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v46 -; SI-NEXT: v_mov_b32_e32 v46, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v61 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v53, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v31 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v40, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mov_b32_e32 v21, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_mov_b32_e32 v39, v35 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v44 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_mov_b32_e32 v5, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v35 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v42 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 ; SI-NEXT: s_branch .LBB105_3 ; SI-NEXT: .LBB105_2: +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_mov_b32_e32 v45, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: v_mov_b32_e32 v7, v40 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v50, v48 -; SI-NEXT: v_mov_b32_e32 v48, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v58, v5 +; SI-NEXT: v_mov_b32_e32 v56, v36 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v44, v21 +; SI-NEXT: v_mov_b32_e32 v53, v25 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v46, v9 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v53, v58 -; SI-NEXT: v_mov_b32_e32 v28, v3 -; SI-NEXT: v_mov_b32_e32 v38, v13 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v60, v40 +; SI-NEXT: v_mov_b32_e32 v63, v47 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v52, v63 -; SI-NEXT: v_mov_b32_e32 v5, v23 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_mov_b32_e32 v42, v30 +; SI-NEXT: v_mov_b32_e32 v40, v32 ; SI-NEXT: .LBB105_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v13, v37 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v19, v48 -; SI-NEXT: v_mov_b32_e32 v63, v7 -; SI-NEXT: v_mov_b32_e32 v58, v53 -; SI-NEXT: v_mov_b32_e32 v37, v27 -; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v34, v3 +; SI-NEXT: v_mov_b32_e32 v62, v1 ; SI-NEXT: s_cbranch_vccnz .LBB105_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_mov_b32_e32 v7, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_mov_b32_e32 v17, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; SI-NEXT: v_mov_b32_e32 v11, v50 -; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_mov_b32_e32 v51, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v63, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v61, 0x40c00000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v30 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -234244,302 +233613,403 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v3 +; SI-NEXT: v_lshr_b64 v[4:5], v[45:46], 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: v_lshr_b64 v[8:9], v[8:9], 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: v_lshr_b64 v[10:11], v[10:11], 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_mov_b32_e32 v11, v13 +; SI-NEXT: v_lshr_b64 v[12:13], v[12:13], 16 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; SI-NEXT: v_mov_b32_e32 v13, v15 +; SI-NEXT: v_lshr_b64 v[14:15], v[14:15], 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v16 +; SI-NEXT: v_lshr_b64 v[16:17], v[33:34], 16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v63 +; SI-NEXT: v_lshr_b64 v[62:63], v[1:2], 16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; SI-NEXT: v_mov_b32_e32 v17, v19 +; SI-NEXT: v_lshr_b64 v[18:19], v[18:19], 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshr_b64 v[20:21], v[36:37], 16 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_mov_b32_e32 v21, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[22:23], 16 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; SI-NEXT: v_lshr_b64 v[24:25], v[51:52], 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v25 +; SI-NEXT: v_lshr_b64 v[26:27], v[53:54], 16 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; SI-NEXT: v_lshr_b64 v[28:29], v[38:39], 16 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_lshr_b64 v[49:50], v[30:31], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 +; SI-NEXT: v_mov_b32_e32 v29, v31 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[48:49], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[56:57], v[40:41], 16 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_mov_b32_e32 v29, v56 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[29:30], v[55:56], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[2:3], v[27:28], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_mov_b32_e32 v4, v38 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[29:30], v[27:28], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[4:5], v[37:38], 16 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; SI-NEXT: v_mov_b32_e32 v6, v49 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[29:30], v[25:26], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[6:7], v[48:49], 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 -; SI-NEXT: v_mov_b32_e32 v8, v51 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[29:30], v[23:24], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[8:9], v[50:51], 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 -; SI-NEXT: v_lshr_b64 v[51:52], v[61:62], 16 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshr_b64 v[10:11], v[27:28], 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; SI-NEXT: v_lshr_b64 v[14:15], v[53:54], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_mov_b32_e32 v16, v33 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[29:30], v[21:22], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[16:17], v[32:33], 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 -; SI-NEXT: v_mov_b32_e32 v18, v41 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[29:30], v[19:20], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[18:19], v[40:41], 16 -; SI-NEXT: v_mov_b32_e32 v39, v59 -; SI-NEXT: v_mov_b32_e32 v40, v60 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v20 -; SI-NEXT: v_lshr_b64 v[20:21], v[39:40], 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 -; SI-NEXT: v_lshr_b64 v[22:23], v[56:57], 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v24 -; SI-NEXT: v_lshr_b64 v[24:25], v[46:47], 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v26 -; SI-NEXT: v_lshr_b64 v[26:27], v[35:36], 16 -; SI-NEXT: v_mov_b32_e32 v27, v30 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v34 -; SI-NEXT: v_lshr_b64 v[33:34], v[29:30], 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v31 -; SI-NEXT: v_mov_b32_e32 v30, v44 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[29:30], v[17:18], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[30:31], v[43:44], 16 -; SI-NEXT: v_lshr_b64 v[43:44], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[44:45], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[25:26], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[23:24], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[21:22], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[19:20], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[17:18], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[15:16], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[29:30], v[11:12], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[29:30], v[9:10], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[29:30], v[7:8], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[31:32], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[5:6], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[27:28], v[7:8], 16 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[29:30], v[3:4], 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: .LBB105_5: ; %end -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -234547,16 +234017,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -234564,16 +234034,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -234581,16 +234051,19 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -234598,16 +234071,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -234615,19 +234088,19 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -234635,19 +234108,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -234655,16 +234125,19 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -234672,15 +234145,15 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload @@ -234692,15 +234165,15 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload @@ -234712,59 +234185,62 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -234811,650 +234287,644 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v17 ; VI-NEXT: v_mov_b32_e32 v30, v16 -; VI-NEXT: v_mov_b32_e32 v48, v15 -; VI-NEXT: v_mov_b32_e32 v49, v13 -; VI-NEXT: v_mov_b32_e32 v50, v11 -; VI-NEXT: v_mov_b32_e32 v51, v9 -; VI-NEXT: v_mov_b32_e32 v52, v7 -; VI-NEXT: v_mov_b32_e32 v53, v5 -; VI-NEXT: v_mov_b32_e32 v54, v3 -; VI-NEXT: v_mov_b32_e32 v55, v1 +; VI-NEXT: v_mov_b32_e32 v29, v15 ; VI-NEXT: v_mov_b32_e32 v28, v14 +; VI-NEXT: v_mov_b32_e32 v27, v13 ; VI-NEXT: v_mov_b32_e32 v26, v12 +; VI-NEXT: v_mov_b32_e32 v25, v11 ; VI-NEXT: v_mov_b32_e32 v24, v10 +; VI-NEXT: v_mov_b32_e32 v23, v9 ; VI-NEXT: v_mov_b32_e32 v22, v8 +; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 -; VI-NEXT: v_mov_b32_e32 v40, v4 +; VI-NEXT: v_mov_b32_e32 v19, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v17, v3 ; VI-NEXT: v_mov_b32_e32 v16, v2 -; VI-NEXT: v_mov_b32_e32 v14, v0 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v39, s17 -; VI-NEXT: v_mov_b32_e32 v38, s19 -; VI-NEXT: v_mov_b32_e32 v37, s21 -; VI-NEXT: v_mov_b32_e32 v36, s23 -; VI-NEXT: v_mov_b32_e32 v35, s25 -; VI-NEXT: v_mov_b32_e32 v34, s27 -; VI-NEXT: v_mov_b32_e32 v18, s29 +; VI-NEXT: v_mov_b32_e32 v15, v1 +; VI-NEXT: v_mov_b32_e32 v43, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v2, s18 +; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 +; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 +; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB105_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB105_3 ; VI-NEXT: .LBB105_2: ; %cmp.true -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v15, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v15, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v15, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v40, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v52, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v22, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v42, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v24, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v50, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v26, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v44, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v28, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v48, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v46, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v18, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v14 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v18, v18, v33, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v16, v33, vcc +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v17, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v16, v16, v33, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v17, v33, vcc +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v17, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v33, v17, v33, vcc +; VI-NEXT: v_bfe_u32 v17, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v17, v32, vcc +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v19 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v32, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v19, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v34, v32, v34, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v14 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v35, v19, v32, vcc +; VI-NEXT: v_bfe_u32 v19, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v14 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v20, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v19, v19, v32, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v14 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v50, v20, v32, vcc +; VI-NEXT: v_bfe_u32 v20, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v14 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v21, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v20, v20, v32, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v14 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v51, v21, v32, vcc +; VI-NEXT: v_bfe_u32 v21, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v14 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v22 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v22, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v21, v21, v32, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v14 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v52, v22, v32, vcc +; VI-NEXT: v_bfe_u32 v22, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v14 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v23, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v22, v32, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v14 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v53, v23, v32, vcc +; VI-NEXT: v_bfe_u32 v23, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v14 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v24 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v24, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v23, v32, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v14 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v54, v24, v32, vcc +; VI-NEXT: v_bfe_u32 v24, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v14 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v25, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v24, v24, v32, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v14 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v55, v25, v32, vcc +; VI-NEXT: v_bfe_u32 v25, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v14 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v26 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v26, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v25, v25, v32, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v14 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v40, v26, v32, vcc +; VI-NEXT: v_bfe_u32 v26, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v14 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v27, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v26, v26, v32, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v14 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v41, v27, v32, vcc +; VI-NEXT: v_bfe_u32 v27, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v14 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v28 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v28, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v27, v27, v32, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v14 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v42, v28, v32, vcc +; VI-NEXT: v_bfe_u32 v28, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v14 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v29, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v28, v28, v32, vcc +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v14 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v30 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v49, v29, v32, vcc +; VI-NEXT: v_bfe_u32 v29, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v14 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v30 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v30, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v14 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v17, v30, v32, vcc +; VI-NEXT: v_bfe_u32 v30, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v14 +; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v31 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v30, v30, v32, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v45, v31, v32, vcc +; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v30, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v16, v3, v5, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v56, v3, v5, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v58, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v7, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v7, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v1 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; VI-NEXT: v_bfe_u32 v6, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v60, v7, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v1 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; VI-NEXT: v_bfe_u32 v6, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_bfe_u32 v14, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v32, v14, v32, vcc +; VI-NEXT: v_bfe_u32 v14, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v9, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v1 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v62, v9, v11, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; VI-NEXT: v_cndmask_b32_e32 v46, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; VI-NEXT: v_bfe_u32 v8, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; VI-NEXT: v_bfe_u32 v11, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v11, v13, vcc -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v2 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v47, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v2 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v56, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v3 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v4 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v57, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v4 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v4, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v5 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v58, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v5 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v5, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v6 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v59, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v6 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v60, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v8 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v61, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v8 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v8, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v9 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v62, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v9 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_bfe_u32 v13, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; VI-NEXT: v_cndmask_b32_e32 v9, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v10 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 -; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v63, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v10 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v11, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v34 -; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v19, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_bfe_u32 v19, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v15 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v10, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v11 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v39, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v11 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_bfe_u32 v15, v12, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v32, v19, v32, vcc -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v12 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; VI-NEXT: v_cndmask_b32_e32 v11, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v12 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v15, v15, v19, vcc -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v48, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v12 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_bfe_u32 v19, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v15 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc -; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v19, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v12, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v38, v14, v36, vcc +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v36, vcc +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v43 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_bfe_u32 v34, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v14 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v14 +; VI-NEXT: v_bfe_u32 v36, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v14 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v15 -; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v43, v36, v37, vcc +; VI-NEXT: v_bfe_u32 v36, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v14 +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v15, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; VI-NEXT: v_and_b32_e32 v34, 0xffff0000, v55 -; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 -; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 -; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x7fff, v35 -; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 -; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc -; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v55 -; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 -; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v14, v36, v37, vcc +; VI-NEXT: v_bfe_u32 v36, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v15 ; VI-NEXT: v_add_u32_e32 v36, vcc, 0x7fff, v36 -; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v34 -; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[56:57] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_mov_b32_e32 v39, v34 -; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[58:59] -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_mov_b32_e32 v38, v34 -; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[60:61] -; VI-NEXT: v_cndmask_b32_e32 v54, v36, v37, vcc -; VI-NEXT: v_mov_b32_e32 v37, v34 -; VI-NEXT: v_lshrrev_b64 v[34:35], 16, v[62:63] -; VI-NEXT: v_mov_b32_e32 v35, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[32:33] -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_mov_b32_e32 v34, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[54:55] -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[14:15] -; VI-NEXT: v_mov_b32_e32 v55, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[40:41] -; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] -; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13] -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[28:29] -; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[26:27] -; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[24:25] -; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] -; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[32:33] -; VI-NEXT: v_mov_b32_e32 v54, v32 -; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[52:53] -; VI-NEXT: v_mov_b32_e32 v53, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[42:43] -; VI-NEXT: v_mov_b32_e32 v52, v32 -; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[50:51] -; VI-NEXT: v_mov_b32_e32 v51, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[44:45] -; VI-NEXT: v_mov_b32_e32 v50, v32 -; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[48:49] -; VI-NEXT: v_mov_b32_e32 v49, v0 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[16:17] -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v48, v32 -; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[30:31] -; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[46:47] -; VI-NEXT: v_mov_b32_e32 v31, v32 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b64 v[40:41], 16, v[15:16] -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v36, v37, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[36:37], 16, v[14:15] +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 +; VI-NEXT: v_lshrrev_b64 v[43:44], 16, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v38 +; VI-NEXT: v_lshrrev_b64 v[37:38], 16, v[12:13] +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v48 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; VI-NEXT: v_lshrrev_b64 v[38:39], 16, v[10:11] +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v63 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v62 +; VI-NEXT: v_lshrrev_b64 v[62:63], 16, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v61 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v60 +; VI-NEXT: v_lshrrev_b64 v[60:61], 16, v[6:7] +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v59 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v58 +; VI-NEXT: v_lshrrev_b64 v[58:59], 16, v[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v57 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; VI-NEXT: v_lshrrev_b64 v[56:57], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v47 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_lshrrev_b64 v[46:47], 16, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[31:32] +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v45 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[30:31] +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[29:30] +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v49 +; VI-NEXT: v_lshrrev_b64 v[31:32], 16, v[28:29] +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v42 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[27:28] +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 +; VI-NEXT: v_lshrrev_b64 v[41:42], 16, v[26:27] +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v40 +; VI-NEXT: v_lshrrev_b64 v[26:27], 16, v[25:26] +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v55 +; VI-NEXT: v_lshrrev_b64 v[39:40], 16, v[24:25] +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v54 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[23:24] +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; VI-NEXT: v_lshrrev_b64 v[53:54], 16, v[22:23] +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v52 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v51 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; VI-NEXT: v_lshrrev_b64 v[51:52], 16, v[20:21] +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v50 +; VI-NEXT: v_lshrrev_b64 v[49:50], 16, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[19:20] +; VI-NEXT: v_mov_b32_e32 v3, v56 +; VI-NEXT: v_mov_b32_e32 v5, v58 +; VI-NEXT: v_mov_b32_e32 v7, v60 +; VI-NEXT: v_mov_b32_e32 v9, v62 +; VI-NEXT: v_mov_b32_e32 v11, v38 +; VI-NEXT: v_mov_b32_e32 v13, v37 +; VI-NEXT: v_mov_b32_e32 v15, v36 +; VI-NEXT: v_mov_b32_e32 v21, v51 +; VI-NEXT: v_mov_b32_e32 v23, v53 +; VI-NEXT: v_mov_b32_e32 v25, v39 +; VI-NEXT: v_mov_b32_e32 v27, v41 +; VI-NEXT: v_mov_b32_e32 v29, v31 +; VI-NEXT: v_mov_b32_e32 v31, v14 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[32:33], 16, v[33:34] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[33:34], 16, v[16:17] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[15:16] +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[18:19] +; VI-NEXT: v_mov_b32_e32 v1, v46 +; VI-NEXT: v_mov_b32_e32 v17, v33 +; VI-NEXT: v_mov_b32_e32 v19, v49 ; VI-NEXT: .LBB105_3: ; %end -; VI-NEXT: v_mov_b32_e32 v13, v18 -; VI-NEXT: v_mov_b32_e32 v18, v40 +; VI-NEXT: v_mov_b32_e32 v14, v43 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -235471,20 +234941,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v1, v39 -; VI-NEXT: v_mov_b32_e32 v3, v38 -; VI-NEXT: v_mov_b32_e32 v5, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 -; VI-NEXT: v_mov_b32_e32 v9, v35 -; VI-NEXT: v_mov_b32_e32 v11, v34 -; VI-NEXT: v_mov_b32_e32 v15, v55 -; VI-NEXT: v_mov_b32_e32 v17, v54 -; VI-NEXT: v_mov_b32_e32 v19, v53 -; VI-NEXT: v_mov_b32_e32 v21, v52 -; VI-NEXT: v_mov_b32_e32 v23, v51 -; VI-NEXT: v_mov_b32_e32 v25, v50 -; VI-NEXT: v_mov_b32_e32 v27, v49 -; VI-NEXT: v_mov_b32_e32 v29, v48 +; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB105_4: @@ -237720,22 +237177,21 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v9, v52, v1 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v11, v51, v1 +; SI-NEXT: s_mov_b32 s6, 0x30000 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_or_b32_e32 v3, v55, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v9, v52, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v3 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v40 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_mov_b32 s6, 0x30000 -; SI-NEXT: v_or_b32_e32 v3, v51, v3 -; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v3 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 @@ -237787,12 +237243,12 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: v_or_b32_e32 v3, v31, v3 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v18, v11 ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload @@ -237814,26 +237270,26 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v30, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v11, v16, v11 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v28, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v14, v14, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v3, v28, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill @@ -238753,21 +238209,19 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB107_3 ; SI-NEXT: .LBB107_2: -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: s_mov_b32 s79, s23 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: s_mov_b32 s77, s18 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: s_mov_b32 s18, s75 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr63 @@ -238823,11 +238277,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: ; implicit-def: $sgpr19 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: .LBB107_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_mov_b32 s4, s60 @@ -240780,6 +240236,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v64i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -240798,739 +240256,821 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v30 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v61, s28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v62, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v33, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v56 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v27 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v37, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s29 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s18 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 -; SI-NEXT: s_cbranch_scc0 .LBB109_2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, s24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB109_3 -; SI-NEXT: .LBB109_2: -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: .LBB109_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v8, v3 -; SI-NEXT: s_cbranch_vccnz .LBB109_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_mov_b32_e32 v47, v38 +; SI-NEXT: s_cbranch_execnz .LBB109_3 +; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_mov_b32_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_or_b32_e32 v54, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v6, v24 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v39, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_mov_b32_e32 v41, v23 +; SI-NEXT: v_or_b32_e32 v56, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_mov_b32_e32 v55, v19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v36, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_mov_b32_e32 v45, v35 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 -; SI-NEXT: v_mov_b32_e32 v57, v33 -; SI-NEXT: v_or_b32_e32 v34, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_or_b32_e32 v22, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 -; SI-NEXT: v_or_b32_e32 v32, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v15, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v9, v31 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshr_b64 v[58:59], v[33:34], 16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_or_b32_e32 v32, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_or_b32_e32 v11, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v20, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v39, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v30, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_or_b32_e32 v58, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_mov_b32_e32 v12, v42 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v61, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v35, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v26, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v37, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v27, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v44, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_or_b32_e32 v46, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_or_b32_e32 v29, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v59, v3, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v22, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v5, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v18, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v30, v3, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v5, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v16, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v14, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v60 -; SI-NEXT: v_or_b32_e32 v43, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v60 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_or_b32_e32 v23, v5, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_or_b32_e32 v10, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v3 +; SI-NEXT: v_or_b32_e32 v52, v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_or_b32_e32 v41, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_or_b32_e32 v51, v3, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_lshr_b64 v[62:63], v[38:39], 16 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 -; SI-NEXT: v_or_b32_e32 v4, v3, v4 -; SI-NEXT: v_mov_b32_e32 v63, v51 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v45 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; SI-NEXT: v_or_b32_e32 v63, v5, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 -; SI-NEXT: v_or_b32_e32 v44, v28, v33 -; SI-NEXT: v_lshr_b64 v[46:47], v[29:30], 16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v62, v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_or_b32_e32 v7, v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v49, v24, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_lshr_b64 v[3:4], v[45:46], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v48, v21 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v52 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v52, v20, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v61 -; SI-NEXT: v_or_b32_e32 v61, v24, v29 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v37, v20, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: v_or_b32_e32 v12, v28, v25 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v5, v1, v55 +; SI-NEXT: v_lshr_b64 v[1:2], v[43:44], 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_mov_b32_e32 v2, v27 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v12 -; SI-NEXT: v_or_b32_e32 v12, v20, v21 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, v58 +; SI-NEXT: v_mov_b32_e32 v43, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v48 +; SI-NEXT: v_or_b32_e32 v6, v40, v53 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 -; SI-NEXT: v_or_b32_e32 v12, v24, v17 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v8, v37 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v28, v15 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v8, v35 +; SI-NEXT: v_lshr_b64 v[47:48], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v36, v53 +; SI-NEXT: v_mov_b32_e32 v37, v54 +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v12 -; SI-NEXT: v_or_b32_e32 v12, v20, v13 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[47:48], v[34:35], 16 +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v24, v42 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v20, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v54 +; SI-NEXT: v_lshr_b64 v[47:48], v[60:61], 16 +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v28, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[56:57], v[31:32], 16 -; SI-NEXT: v_or_b32_e32 v54, v20, v40 -; SI-NEXT: v_or_b32_e32 v20, v24, v5 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v31, v55 -; SI-NEXT: v_lshr_b64 v[54:55], v[15:16], 16 -; SI-NEXT: v_mov_b32_e32 v15, v20 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 -; SI-NEXT: v_or_b32_e32 v8, v28, v3 -; SI-NEXT: v_lshr_b64 v[28:29], v[5:6], 16 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshr_b64 v[47:48], v[57:58], 16 +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v50, v1 -; SI-NEXT: v_lshr_b64 v[49:50], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[42:43], 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[47:48], v[38:39], 16 +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[9:10], 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v35, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[40:41], 16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 -; SI-NEXT: v_mov_b32_e32 v42, v61 -; SI-NEXT: v_mov_b32_e32 v61, v37 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v51, v43 -; SI-NEXT: .LBB109_5: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshr_b64 v[47:48], v[19:20], 16 +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[47:48], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[28:29], 16 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[26:27], v[26:27], 16 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: v_mov_b32_e32 v14, v23 +; SI-NEXT: v_mov_b32_e32 v10, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[31:32], 16 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v61 +; SI-NEXT: v_mov_b32_e32 v2, v39 +; SI-NEXT: v_lshr_b64 v[47:48], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[55:56], 16 +; SI-NEXT: v_mov_b32_e32 v15, v25 +; SI-NEXT: v_lshr_b64 v[24:25], v[53:54], 16 +; SI-NEXT: v_mov_b32_e32 v28, v12 +; SI-NEXT: v_mov_b32_e32 v35, v20 +; SI-NEXT: v_mov_b32_e32 v12, v11 +; SI-NEXT: v_mov_b32_e32 v31, v17 +; SI-NEXT: v_mov_b32_e32 v53, v23 +; SI-NEXT: v_mov_b32_e32 v23, v47 +; SI-NEXT: v_mov_b32_e32 v48, v3 +; SI-NEXT: v_mov_b32_e32 v34, v26 +; SI-NEXT: v_mov_b32_e32 v19, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[8:9], v[36:37], 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v8, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v36, v1 +; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -241552,6 +241092,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB109_4: +; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v64f16_to_v64i16_scalar: ; VI: ; %bb.0: @@ -243229,273 +242771,294 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 -; SI-NEXT: v_mov_b32_e32 v42, v4 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v42, v11 +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v41 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB111_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s27 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, s29 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, v23 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v4 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s20 -; SI-NEXT: v_mov_b32_e32 v3, v10 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v5, v27 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v6 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v6 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v59, s25 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v22 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v8 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v8, v19 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v9 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s24 -; SI-NEXT: v_mov_b32_e32 v24, v43 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v9, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v10 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_mov_b32_e32 v61, v30 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v10, v21 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 +; SI-NEXT: v_mov_b32_e32 v3, v24 +; SI-NEXT: v_mov_b32_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v12 +; SI-NEXT: v_mov_b32_e32 v4, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_mov_b32_e32 v60, v28 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v28 +; SI-NEXT: v_mov_b32_e32 v28, v43 +; SI-NEXT: v_mov_b32_e32 v6, v29 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v14 +; SI-NEXT: v_mov_b32_e32 v61, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v48 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 -; SI-NEXT: v_mov_b32_e32 v25, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v62 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v23 -; SI-NEXT: v_mov_b32_e32 v23, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_mov_b32_e32 v29, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v31 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v27 +; SI-NEXT: v_mov_b32_e32 v27, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v34 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 +; SI-NEXT: v_mov_b32_e32 v30, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_branch .LBB111_3 ; SI-NEXT: .LBB111_2: -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr41 ; SI-NEXT: v_mov_b32_e32 v61, v30 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v6, v29 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: v_mov_b32_e32 v3, v10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: v_mov_b32_e32 v60, v28 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: v_mov_b32_e32 v5, v27 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v4, v26 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v7, v25 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; kill: killed $vgpr41 +; SI-NEXT: v_mov_b32_e32 v3, v24 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_mov_b32_e32 v2, v23 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: v_mov_b32_e32 v1, v22 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v10, v21 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: v_mov_b32_e32 v9, v20 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: v_mov_b32_e32 v8, v19 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; kill: killed $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 ; SI-NEXT: ; kill: killed $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: .LBB111_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) @@ -243503,262 +243066,251 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v42, v44 ; SI-NEXT: v_mov_b32_e32 v44, v46 ; SI-NEXT: v_mov_b32_e32 v46, v56 -; SI-NEXT: v_mov_b32_e32 v56, v58 -; SI-NEXT: v_mov_b32_e32 v58, v5 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: v_mov_b32_e32 v56, v11 ; SI-NEXT: v_mov_b32_e32 v11, v13 ; SI-NEXT: v_mov_b32_e32 v13, v15 ; SI-NEXT: v_mov_b32_e32 v15, v17 ; SI-NEXT: v_mov_b32_e32 v17, v19 -; SI-NEXT: v_mov_b32_e32 v19, v1 +; SI-NEXT: v_mov_b32_e32 v19, v21 +; SI-NEXT: v_mov_b32_e32 v21, v23 +; SI-NEXT: v_mov_b32_e32 v23, v58 ; SI-NEXT: s_cbranch_vccnz .LBB111_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v2 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s16 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s17 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s18 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s20 -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 ; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s21 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 ; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s22 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 ; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 ; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, s24 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 ; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, s25 ; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 ; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v62 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v63 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v60 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v15, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v16, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v18, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v20, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s26 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: .LBB111_5: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -243768,8 +243320,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -243779,8 +243331,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -243790,7 +243342,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -243799,34 +243351,32 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v28 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -243836,8 +243386,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -243847,8 +243397,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -243858,8 +243408,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -243869,8 +243419,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -243880,8 +243430,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -243891,8 +243441,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -243902,8 +243452,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -243913,8 +243463,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -243924,8 +243474,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -243935,77 +243485,85 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -244032,7 +243590,7 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 231460f584a2e..9d397051a7dd4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -8197,15 +8197,15 @@ define inreg <16 x i8> @bitcast_v4f32_to_v16i8_scalar(<4 x float> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB49_4 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v21, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s16, 1.0 ; SI-NEXT: v_add_f32_e64 v19, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v18, s18, 1.0 -; SI-NEXT: v_lshr_b64 v[0:1], v[20:21], 16 +; SI-NEXT: v_add_f32_e64 v21, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s16, 1.0 ; SI-NEXT: v_lshr_b64 v[11:12], v[18:19], 24 ; SI-NEXT: v_lshr_b64 v[16:17], v[18:19], 16 ; SI-NEXT: v_lshr_b64 v[9:10], v[18:19], 8 ; SI-NEXT: v_lshr_b64 v[3:4], v[20:21], 24 +; SI-NEXT: v_lshr_b64 v[12:13], v[20:21], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 8 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v19 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 @@ -8243,11 +8243,11 @@ define inreg <16 x i8> @bitcast_v4f32_to_v16i8_scalar(<4 x float> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v16, s12 ; SI-NEXT: v_mov_b32_e32 v9, s14 ; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v12, s6 ; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: .LBB49_5: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, v20 +; SI-NEXT: v_mov_b32_e32 v2, v12 ; SI-NEXT: v_mov_b32_e32 v4, v21 ; SI-NEXT: v_mov_b32_e32 v8, v18 ; SI-NEXT: v_mov_b32_e32 v10, v16 @@ -16158,20 +16158,20 @@ define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB85_4 ; SI-NEXT: .LBB85_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[20:21], s[16:17], 1.0 ; SI-NEXT: v_add_f64 v[18:19], s[18:19], 1.0 -; SI-NEXT: v_lshr_b64 v[0:1], v[20:21], 16 +; SI-NEXT: v_add_f64 v[22:23], s[16:17], 1.0 ; SI-NEXT: v_lshr_b64 v[11:12], v[18:19], 24 ; SI-NEXT: v_lshr_b64 v[16:17], v[18:19], 16 ; SI-NEXT: v_lshr_b64 v[9:10], v[18:19], 8 -; SI-NEXT: v_lshr_b64 v[3:4], v[20:21], 24 -; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 8 +; SI-NEXT: v_lshr_b64 v[3:4], v[22:23], 24 +; SI-NEXT: v_lshr_b64 v[20:21], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[22:23], 8 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v19 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v21 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v23 ; SI-NEXT: s_branch .LBB85_5 ; SI-NEXT: .LBB85_3: ; SI-NEXT: ; implicit-def: $sgpr14 @@ -16189,8 +16189,8 @@ define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32 ; SI-NEXT: s_branch .LBB85_2 ; SI-NEXT: .LBB85_4: ; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_mov_b32_e32 v21, s17 -; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v22, s16 ; SI-NEXT: v_mov_b32_e32 v18, s18 ; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: v_mov_b32_e32 v14, s26 @@ -16199,15 +16199,15 @@ define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v6, s23 ; SI-NEXT: v_mov_b32_e32 v5, s22 ; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v20, s12 ; SI-NEXT: v_mov_b32_e32 v3, s10 ; SI-NEXT: v_mov_b32_e32 v9, s8 ; SI-NEXT: v_mov_b32_e32 v16, s6 ; SI-NEXT: v_mov_b32_e32 v11, s4 ; SI-NEXT: .LBB85_5: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: v_mov_b32_e32 v0, v20 -; SI-NEXT: v_mov_b32_e32 v4, v21 +; SI-NEXT: v_mov_b32_e32 v0, v22 +; SI-NEXT: v_mov_b32_e32 v2, v20 +; SI-NEXT: v_mov_b32_e32 v4, v23 ; SI-NEXT: v_mov_b32_e32 v8, v18 ; SI-NEXT: v_mov_b32_e32 v10, v16 ; SI-NEXT: v_mov_b32_e32 v12, v19 @@ -18746,12 +18746,12 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v13 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -19308,10 +19308,10 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_2 @@ -22576,97 +22576,97 @@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v16i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s22 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 -; SI-NEXT: v_or_b32_e32 v19, v16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_or_b32_e32 v20, v8, v0 -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_or_b32_e32 v17, v25, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; SI-NEXT: v_or_b32_e32 v18, v24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_or_b32_e32 v19, v8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_or_b32_e32 v17, v22, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v18, v21, v2 ; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 +; SI-NEXT: v_lshr_b64 v[11:12], v[17:18], 24 +; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[17:18], 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v18 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshr_b64 v[11:12], v[17:18], 24 -; SI-NEXT: v_lshr_b64 v[21:22], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[17:18], 8 ; SI-NEXT: s_cbranch_execnz .LBB105_3 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v17, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; SI-NEXT: v_or_b32_e32 v18, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v17, v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v18, v3, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v19, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; SI-NEXT: v_or_b32_e32 v20, v2, v0 -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v19, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 ; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 -; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 ; SI-NEXT: v_lshr_b64 v[11:12], v[17:18], 24 -; SI-NEXT: v_lshr_b64 v[21:22], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 +; SI-NEXT: v_lshr_b64 v[12:13], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[9:10], v[17:18], 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v18 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; SI-NEXT: .LBB105_3: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, v20 ; SI-NEXT: v_mov_b32_e32 v8, v17 -; SI-NEXT: v_mov_b32_e32 v10, v21 +; SI-NEXT: v_mov_b32_e32 v10, v12 ; SI-NEXT: v_mov_b32_e32 v12, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB105_4: ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 @@ -24555,93 +24555,92 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: v_mul_f32_e64 v28, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v27, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v30, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v30 -; SI-NEXT: v_lshr_b64 v[19:20], v[8:9], 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshr_b64 v[22:23], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshr_b64 v[24:25], v[8:9], 16 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v29 -; SI-NEXT: v_lshr_b64 v[20:21], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[10:11], v[22:23], 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 +; SI-NEXT: v_lshr_b64 v[21:22], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v27 +; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[24:25], 8 +; SI-NEXT: v_lshr_b64 v[3:4], v[21:22], 24 +; SI-NEXT: v_lshr_b64 v[19:20], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[21:22], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 24 +; SI-NEXT: v_lshr_b64 v[17:18], v[24:25], 16 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v27 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v29 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v23 -; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshr_b64 v[17:18], v[22:23], 24 -; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v25 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[22:23], v[0:1], 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[24:25], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[19:20], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[0:1], 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_lshr_b64 v[20:21], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[10:11], v[22:23], 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 -; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 -; SI-NEXT: v_lshr_b64 v[17:18], v[22:23], 24 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; SI-NEXT: v_lshr_b64 v[11:12], v[22:23], 8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v23 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[21:22], 24 +; SI-NEXT: v_lshr_b64 v[19:20], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[21:22], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[24:25], 24 +; SI-NEXT: v_lshr_b64 v[17:18], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[24:25], 8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v25 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v8 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: v_mov_b32_e32 v0, v19 -; SI-NEXT: v_mov_b32_e32 v4, v20 -; SI-NEXT: v_mov_b32_e32 v5, v9 -; SI-NEXT: v_mov_b32_e32 v8, v22 -; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: v_mov_b32_e32 v11, v17 -; SI-NEXT: v_mov_b32_e32 v12, v23 +; SI-NEXT: v_mov_b32_e32 v0, v21 +; SI-NEXT: v_mov_b32_e32 v2, v19 +; SI-NEXT: v_mov_b32_e32 v4, v22 +; SI-NEXT: v_mov_b32_e32 v5, v10 +; SI-NEXT: v_mov_b32_e32 v8, v24 +; SI-NEXT: v_mov_b32_e32 v10, v17 +; SI-NEXT: v_mov_b32_e32 v12, v25 ; SI-NEXT: v_mov_b32_e32 v13, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v8bf16_to_v16i8_scalar: @@ -24750,7 +24749,7 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: s_branch .LBB109_5 ; VI-NEXT: .LBB109_3: ; VI-NEXT: ; implicit-def: $sgpr13 @@ -24777,7 +24776,7 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v14, s20 ; VI-NEXT: v_mov_b32_e32 v13, s15 ; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_mov_b32_e32 v11, s13 +; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_mov_b32_e32 v7, s12 ; VI-NEXT: v_mov_b32_e32 v6, s11 ; VI-NEXT: v_mov_b32_e32 v5, s10 @@ -24785,7 +24784,6 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v17, s4 ; VI-NEXT: .LBB109_5: ; %end ; VI-NEXT: v_mov_b32_e32 v3, v17 -; VI-NEXT: v_mov_b32_e32 v1, v11 ; VI-NEXT: v_mov_b32_e32 v11, v16 ; VI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll index e3b374b712717..50bed96e758c1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll @@ -3968,7 +3968,7 @@ define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i ; SI-LABEL: bitcast_v14f16_to_v14i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v14, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 @@ -3983,70 +3983,71 @@ define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_or_b32_e32 v10, v10, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_or_b32_e32 v6, v6, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v2, v2, v14 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[14:15], v[1:2], 16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshr_b64 v[15:16], v[5:6], 16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 ; SI-NEXT: v_lshr_b64 v[16:17], v[9:10], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v12, v12, v18 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: v_mov_b32_e32 v1, v14 ; SI-NEXT: v_mov_b32_e32 v5, v15 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index 155ec568a65d3..034a52d5d519c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -2542,56 +2542,56 @@ define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -3386,18 +3386,18 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: .LBB22_4: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -4063,16 +4063,16 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 ; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 ; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 ; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 @@ -4092,15 +4092,15 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -4153,153 +4153,154 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v8 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v8 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3] -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 -; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3] -; VI-NEXT: v_add_f32_e32 v2, s4, v8 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 -; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v8 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 -; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14] -; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, v10 -; VI-NEXT: v_mov_b32_e32 v3, v9 -; VI-NEXT: v_mov_b32_e32 v5, v8 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 16, v[11:12] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v12, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[12:13] +; VI-NEXT: v_mov_b32_e32 v1, v11 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mov_b32_e32 v5, v9 +; VI-NEXT: v_mov_b32_e32 v7, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -4993,221 +4994,225 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { ; VI-LABEL: bitcast_v8i32_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v35, v5 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_mov_b32_e32 v37, v3 +; VI-NEXT: v_mov_b32_e32 v36, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: .LBB24_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB24_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v39, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v38 +; VI-NEXT: v_add_u32_e32 v37, vcc, 3, v37 +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v36 ; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v35 ; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 ; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 ; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 -; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: .LBB24_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v8, v34 -; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: v_mov_b32_e32 v0, v38 +; VI-NEXT: v_mov_b32_e32 v4, v39 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v12, v37 +; VI-NEXT: v_mov_b32_e32 v16, v34 +; VI-NEXT: v_mov_b32_e32 v20, v35 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v8i32_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 -; GFX9-NEXT: v_mov_b32_e32 v35, v3 -; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v35, v5 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_mov_b32_e32 v37, v3 +; GFX9-NEXT: v_mov_b32_e32 v36, v2 +; GFX9-NEXT: v_mov_b32_e32 v39, v1 +; GFX9-NEXT: v_mov_b32_e32 v38, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB24_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB24_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v39, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v38, 3, v38 +; GFX9-NEXT: v_add_u32_e32 v37, 3, v37 +; GFX9-NEXT: v_add_u32_e32 v36, 3, v36 ; GFX9-NEXT: v_add_u32_e32 v35, 3, v35 ; GFX9-NEXT: v_add_u32_e32 v34, 3, v34 ; GFX9-NEXT: v_add_u32_e32 v33, 3, v33 ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 -; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB24_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v34 -; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v12, v37 +; GFX9-NEXT: v_mov_b32_e32 v16, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v35 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v8i32_to_v32i8: @@ -6261,14 +6266,14 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v7, 0x300 -; VI-NEXT: v_add_u16_e32 v2, 3, v33 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 ; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 ; VI-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 @@ -6913,42 +6918,6 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v17 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 -; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v23, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -6966,19 +6935,55 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_or_b32 s6, s5, s6 +; SI-NEXT: s_and_b32 s5, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s7, s5, s7 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v3, s5, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s7 ; SI-NEXT: s_cbranch_execnz .LBB27_3 ; SI-NEXT: .LBB27_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -7105,24 +7110,6 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -7140,19 +7127,37 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_or_b32 s6, s5, s6 +; VI-NEXT: s_and_b32 s5, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s7, s5, s7 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s5, v0 +; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: s_cbranch_execnz .LBB27_3 ; VI-NEXT: .LBB27_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 @@ -7261,24 +7266,6 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -7296,19 +7283,37 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_or_b32 s6, s5, s6 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s7, s5, s7 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_cbranch_execnz .LBB27_3 ; GFX9-NEXT: .LBB27_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s16, s16, 3 @@ -9697,56 +9702,56 @@ define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: .LBB42_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -10543,18 +10548,18 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -11220,16 +11225,16 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 ; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 ; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 ; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 @@ -11249,15 +11254,15 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -11310,153 +11315,154 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v8 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v8 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3] -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 -; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3] -; VI-NEXT: v_add_f32_e32 v2, s4, v8 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 -; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v8 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 -; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14] -; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, v10 -; VI-NEXT: v_mov_b32_e32 v3, v9 -; VI-NEXT: v_mov_b32_e32 v5, v8 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 16, v[11:12] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v12, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[12:13] +; VI-NEXT: v_mov_b32_e32 v1, v11 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mov_b32_e32 v5, v9 +; VI-NEXT: v_mov_b32_e32 v7, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -12150,221 +12156,225 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { ; VI-LABEL: bitcast_v8f32_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v35, v5 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_mov_b32_e32 v37, v3 +; VI-NEXT: v_mov_b32_e32 v36, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB48_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v39, 1.0, v39 +; VI-NEXT: v_add_f32_e32 v38, 1.0, v38 +; VI-NEXT: v_add_f32_e32 v37, 1.0, v37 +; VI-NEXT: v_add_f32_e32 v36, 1.0, v36 ; VI-NEXT: v_add_f32_e32 v35, 1.0, v35 ; VI-NEXT: v_add_f32_e32 v34, 1.0, v34 ; VI-NEXT: v_add_f32_e32 v33, 1.0, v33 ; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 -; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v8, v34 -; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: v_mov_b32_e32 v0, v38 +; VI-NEXT: v_mov_b32_e32 v4, v39 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v12, v37 +; VI-NEXT: v_mov_b32_e32 v16, v34 +; VI-NEXT: v_mov_b32_e32 v20, v35 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v8f32_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 -; GFX9-NEXT: v_mov_b32_e32 v35, v3 -; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v35, v5 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_mov_b32_e32 v37, v3 +; GFX9-NEXT: v_mov_b32_e32 v36, v2 +; GFX9-NEXT: v_mov_b32_e32 v39, v1 +; GFX9-NEXT: v_mov_b32_e32 v38, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB48_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v39, 1.0, v39 +; GFX9-NEXT: v_add_f32_e32 v38, 1.0, v38 +; GFX9-NEXT: v_add_f32_e32 v37, 1.0, v37 +; GFX9-NEXT: v_add_f32_e32 v36, 1.0, v36 ; GFX9-NEXT: v_add_f32_e32 v35, 1.0, v35 ; GFX9-NEXT: v_add_f32_e32 v34, 1.0, v34 ; GFX9-NEXT: v_add_f32_e32 v33, 1.0, v33 ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v34 -; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v12, v37 +; GFX9-NEXT: v_mov_b32_e32 v16, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v35 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v8f32_to_v32i8: @@ -12616,38 +12626,38 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB49_4 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v39, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v38, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v49, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v48, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v35, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v34, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v39, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v38, s18, 1.0 ; SI-NEXT: v_add_f32_e64 v37, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v36, s20, 1.0 -; SI-NEXT: v_lshr_b64 v[27:28], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[24:25], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[0:1], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[48:49], 8 +; SI-NEXT: v_add_f32_e64 v35, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v34, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v49, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v48, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[27:28], v[34:35], 24 ; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 24 -; SI-NEXT: v_lshr_b64 v[32:33], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[38:39], 24 +; SI-NEXT: v_lshr_b64 v[32:33], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[34:35], 8 +; SI-NEXT: v_lshr_b64 v[28:29], v[36:37], 16 ; SI-NEXT: v_lshr_b64 v[17:18], v[36:37], 8 -; SI-NEXT: v_lshr_b64 v[11:12], v[34:35], 24 -; SI-NEXT: v_lshr_b64 v[28:29], v[34:35], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[34:35], 8 -; SI-NEXT: v_lshr_b64 v[3:4], v[38:39], 24 -; SI-NEXT: v_lshr_b64 v[1:2], v[38:39], 8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v49 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v49 +; SI-NEXT: v_lshr_b64 v[20:21], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 8 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[12:13], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v35 ; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 ; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v37 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v49 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 ; SI-NEXT: s_branch .LBB49_5 ; SI-NEXT: .LBB49_3: ; SI-NEXT: ; implicit-def: $sgpr8 @@ -12676,14 +12686,14 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: s_branch .LBB49_2 ; SI-NEXT: .LBB49_4: -; SI-NEXT: v_mov_b32_e32 v38, s16 -; SI-NEXT: v_mov_b32_e32 v39, s17 -; SI-NEXT: v_mov_b32_e32 v34, s18 -; SI-NEXT: v_mov_b32_e32 v35, s19 +; SI-NEXT: v_mov_b32_e32 v48, s16 +; SI-NEXT: v_mov_b32_e32 v49, s17 +; SI-NEXT: v_mov_b32_e32 v38, s18 +; SI-NEXT: v_mov_b32_e32 v39, s19 ; SI-NEXT: v_mov_b32_e32 v36, s20 ; SI-NEXT: v_mov_b32_e32 v37, s21 -; SI-NEXT: v_mov_b32_e32 v48, s22 -; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v34, s22 +; SI-NEXT: v_mov_b32_e32 v35, s23 ; SI-NEXT: v_mov_b32_e32 v5, s59 ; SI-NEXT: v_mov_b32_e32 v6, s57 ; SI-NEXT: v_mov_b32_e32 v7, s56 @@ -12697,30 +12707,30 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v30, s74 ; SI-NEXT: v_mov_b32_e32 v31, s72 ; SI-NEXT: v_mov_b32_e32 v27, s40 -; SI-NEXT: v_mov_b32_e32 v24, s42 +; SI-NEXT: v_mov_b32_e32 v32, s42 ; SI-NEXT: v_mov_b32_e32 v25, s44 ; SI-NEXT: v_mov_b32_e32 v19, s24 -; SI-NEXT: v_mov_b32_e32 v32, s26 +; SI-NEXT: v_mov_b32_e32 v28, s26 ; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_mov_b32_e32 v11, s10 -; SI-NEXT: v_mov_b32_e32 v28, s12 +; SI-NEXT: v_mov_b32_e32 v20, s12 ; SI-NEXT: v_mov_b32_e32 v9, s14 ; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v12, s6 ; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: .LBB49_5: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: v_mov_b32_e32 v0, v38 -; SI-NEXT: v_mov_b32_e32 v4, v39 -; SI-NEXT: v_mov_b32_e32 v10, v28 -; SI-NEXT: v_mov_b32_e32 v8, v34 -; SI-NEXT: v_mov_b32_e32 v12, v35 -; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: v_mov_b32_e32 v0, v48 +; SI-NEXT: v_mov_b32_e32 v2, v12 +; SI-NEXT: v_mov_b32_e32 v4, v49 +; SI-NEXT: v_mov_b32_e32 v8, v38 +; SI-NEXT: v_mov_b32_e32 v10, v20 +; SI-NEXT: v_mov_b32_e32 v12, v39 ; SI-NEXT: v_mov_b32_e32 v16, v36 +; SI-NEXT: v_mov_b32_e32 v18, v28 ; SI-NEXT: v_mov_b32_e32 v20, v37 -; SI-NEXT: v_mov_b32_e32 v26, v24 -; SI-NEXT: v_mov_b32_e32 v24, v48 -; SI-NEXT: v_mov_b32_e32 v28, v49 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v26, v32 +; SI-NEXT: v_mov_b32_e32 v28, v35 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v32i8_scalar: @@ -12755,38 +12765,38 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB49_4 ; VI-NEXT: .LBB49_2: ; %cmp.true -; VI-NEXT: v_add_f32_e64 v1, s17, 1.0 -; VI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; VI-NEXT: v_add_f32_e64 v9, s19, 1.0 -; VI-NEXT: v_add_f32_e64 v8, s18, 1.0 -; VI-NEXT: v_add_f32_e64 v17, s21, 1.0 -; VI-NEXT: v_add_f32_e64 v16, s20, 1.0 -; VI-NEXT: v_add_f32_e64 v25, s23, 1.0 -; VI-NEXT: v_add_f32_e64 v24, s22, 1.0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; VI-NEXT: v_add_f32_e64 v39, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v38, s16, 1.0 +; VI-NEXT: v_add_f32_e64 v37, s19, 1.0 +; VI-NEXT: v_add_f32_e64 v36, s18, 1.0 +; VI-NEXT: v_add_f32_e64 v35, s21, 1.0 +; VI-NEXT: v_add_f32_e64 v34, s20, 1.0 +; VI-NEXT: v_add_f32_e64 v33, s23, 1.0 +; VI-NEXT: v_add_f32_e64 v32, s22, 1.0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: s_branch .LBB49_5 ; VI-NEXT: .LBB49_3: ; VI-NEXT: ; implicit-def: $sgpr59 @@ -12815,30 +12825,30 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: s_branch .LBB49_2 ; VI-NEXT: .LBB49_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v9, s19 -; VI-NEXT: v_mov_b32_e32 v16, s20 -; VI-NEXT: v_mov_b32_e32 v17, s21 -; VI-NEXT: v_mov_b32_e32 v24, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 -; VI-NEXT: v_mov_b32_e32 v35, s59 +; VI-NEXT: v_mov_b32_e32 v38, s16 +; VI-NEXT: v_mov_b32_e32 v39, s17 +; VI-NEXT: v_mov_b32_e32 v36, s18 +; VI-NEXT: v_mov_b32_e32 v37, s19 +; VI-NEXT: v_mov_b32_e32 v34, s20 +; VI-NEXT: v_mov_b32_e32 v35, s21 +; VI-NEXT: v_mov_b32_e32 v32, s22 +; VI-NEXT: v_mov_b32_e32 v33, s23 +; VI-NEXT: v_mov_b32_e32 v1, s59 ; VI-NEXT: v_mov_b32_e32 v2, s57 ; VI-NEXT: v_mov_b32_e32 v5, s58 ; VI-NEXT: v_mov_b32_e32 v6, s56 ; VI-NEXT: v_mov_b32_e32 v7, s47 -; VI-NEXT: v_mov_b32_e32 v34, s46 +; VI-NEXT: v_mov_b32_e32 v9, s46 ; VI-NEXT: v_mov_b32_e32 v10, s44 ; VI-NEXT: v_mov_b32_e32 v13, s45 ; VI-NEXT: v_mov_b32_e32 v14, s43 ; VI-NEXT: v_mov_b32_e32 v15, s42 -; VI-NEXT: v_mov_b32_e32 v33, s41 +; VI-NEXT: v_mov_b32_e32 v17, s41 ; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: v_mov_b32_e32 v21, s40 ; VI-NEXT: v_mov_b32_e32 v22, s28 ; VI-NEXT: v_mov_b32_e32 v23, s27 -; VI-NEXT: v_mov_b32_e32 v32, s26 +; VI-NEXT: v_mov_b32_e32 v25, s26 ; VI-NEXT: v_mov_b32_e32 v26, s24 ; VI-NEXT: v_mov_b32_e32 v29, s25 ; VI-NEXT: v_mov_b32_e32 v30, s15 @@ -12848,14 +12858,14 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v11, s6 ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: .LBB49_5: ; %end -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v12, v9 -; VI-NEXT: v_mov_b32_e32 v20, v17 -; VI-NEXT: v_mov_b32_e32 v28, v25 -; VI-NEXT: v_mov_b32_e32 v1, v35 -; VI-NEXT: v_mov_b32_e32 v9, v34 -; VI-NEXT: v_mov_b32_e32 v17, v33 -; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: v_mov_b32_e32 v0, v38 +; VI-NEXT: v_mov_b32_e32 v4, v39 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v12, v37 +; VI-NEXT: v_mov_b32_e32 v16, v34 +; VI-NEXT: v_mov_b32_e32 v20, v35 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v8f32_to_v32i8_scalar: @@ -12890,38 +12900,38 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB49_4 ; GFX9-NEXT: .LBB49_2: ; %cmp.true -; GFX9-NEXT: v_add_f32_e64 v1, s17, 1.0 -; GFX9-NEXT: v_add_f32_e64 v0, s16, 1.0 -; GFX9-NEXT: v_add_f32_e64 v9, s19, 1.0 -; GFX9-NEXT: v_add_f32_e64 v8, s18, 1.0 -; GFX9-NEXT: v_add_f32_e64 v17, s21, 1.0 -; GFX9-NEXT: v_add_f32_e64 v16, s20, 1.0 -; GFX9-NEXT: v_add_f32_e64 v25, s23, 1.0 -; GFX9-NEXT: v_add_f32_e64 v24, s22, 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; GFX9-NEXT: v_add_f32_e64 v39, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v38, s16, 1.0 +; GFX9-NEXT: v_add_f32_e64 v37, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v36, s18, 1.0 +; GFX9-NEXT: v_add_f32_e64 v35, s21, 1.0 +; GFX9-NEXT: v_add_f32_e64 v34, s20, 1.0 +; GFX9-NEXT: v_add_f32_e64 v33, s23, 1.0 +; GFX9-NEXT: v_add_f32_e64 v32, s22, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: s_branch .LBB49_5 ; GFX9-NEXT: .LBB49_3: ; GFX9-NEXT: ; implicit-def: $sgpr59 @@ -12950,30 +12960,30 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr14 ; GFX9-NEXT: s_branch .LBB49_2 ; GFX9-NEXT: .LBB49_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v8, s18 -; GFX9-NEXT: v_mov_b32_e32 v9, s19 -; GFX9-NEXT: v_mov_b32_e32 v16, s20 -; GFX9-NEXT: v_mov_b32_e32 v17, s21 -; GFX9-NEXT: v_mov_b32_e32 v24, s22 -; GFX9-NEXT: v_mov_b32_e32 v25, s23 -; GFX9-NEXT: v_mov_b32_e32 v35, s59 +; GFX9-NEXT: v_mov_b32_e32 v38, s16 +; GFX9-NEXT: v_mov_b32_e32 v39, s17 +; GFX9-NEXT: v_mov_b32_e32 v36, s18 +; GFX9-NEXT: v_mov_b32_e32 v37, s19 +; GFX9-NEXT: v_mov_b32_e32 v34, s20 +; GFX9-NEXT: v_mov_b32_e32 v35, s21 +; GFX9-NEXT: v_mov_b32_e32 v32, s22 +; GFX9-NEXT: v_mov_b32_e32 v33, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s59 ; GFX9-NEXT: v_mov_b32_e32 v2, s57 ; GFX9-NEXT: v_mov_b32_e32 v5, s58 ; GFX9-NEXT: v_mov_b32_e32 v6, s56 ; GFX9-NEXT: v_mov_b32_e32 v7, s47 -; GFX9-NEXT: v_mov_b32_e32 v34, s46 +; GFX9-NEXT: v_mov_b32_e32 v9, s46 ; GFX9-NEXT: v_mov_b32_e32 v10, s44 ; GFX9-NEXT: v_mov_b32_e32 v13, s45 ; GFX9-NEXT: v_mov_b32_e32 v14, s43 ; GFX9-NEXT: v_mov_b32_e32 v15, s42 -; GFX9-NEXT: v_mov_b32_e32 v33, s41 +; GFX9-NEXT: v_mov_b32_e32 v17, s41 ; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_mov_b32_e32 v21, s40 ; GFX9-NEXT: v_mov_b32_e32 v22, s28 ; GFX9-NEXT: v_mov_b32_e32 v23, s27 -; GFX9-NEXT: v_mov_b32_e32 v32, s26 +; GFX9-NEXT: v_mov_b32_e32 v25, s26 ; GFX9-NEXT: v_mov_b32_e32 v26, s24 ; GFX9-NEXT: v_mov_b32_e32 v29, s25 ; GFX9-NEXT: v_mov_b32_e32 v30, s15 @@ -12983,14 +12993,14 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v11, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: .LBB49_5: ; %end -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-NEXT: v_mov_b32_e32 v20, v17 -; GFX9-NEXT: v_mov_b32_e32 v28, v25 -; GFX9-NEXT: v_mov_b32_e32 v1, v35 -; GFX9-NEXT: v_mov_b32_e32 v9, v34 -; GFX9-NEXT: v_mov_b32_e32 v17, v33 -; GFX9-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v12, v37 +; GFX9-NEXT: v_mov_b32_e32 v16, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v35 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: bitcast_v8f32_to_v32i8_scalar: @@ -13461,14 +13471,14 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v7, 0x300 -; VI-NEXT: v_add_u16_e32 v2, 3, v33 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 ; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 ; VI-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 @@ -14113,42 +14123,6 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v17 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 -; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v23, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -14166,19 +14140,55 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_or_b32 s6, s5, s6 +; SI-NEXT: s_and_b32 s5, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s7, s5, s7 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v3, s5, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s7 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -14305,24 +14315,6 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -14340,19 +14332,37 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_or_b32 s6, s5, s6 +; VI-NEXT: s_and_b32 s5, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s7, s5, s7 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s5, v0 +; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 @@ -14461,24 +14471,6 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -14496,19 +14488,37 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_or_b32 s6, s5, s6 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s7, s5, s7 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_cbranch_execnz .LBB51_3 ; GFX9-NEXT: .LBB51_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s16, s16, 3 @@ -16422,56 +16432,56 @@ define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: .LBB62_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -17268,18 +17278,18 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: .LBB66_4: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -17945,16 +17955,16 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 ; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 ; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 ; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 @@ -17974,15 +17984,15 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -18035,153 +18045,154 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v8 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v8 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3] -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 -; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3] -; VI-NEXT: v_add_f32_e32 v2, s4, v8 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 -; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v8 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 -; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14] -; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, v10 -; VI-NEXT: v_mov_b32_e32 v3, v9 -; VI-NEXT: v_mov_b32_e32 v5, v8 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 16, v[11:12] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v12, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[12:13] +; VI-NEXT: v_mov_b32_e32 v1, v11 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mov_b32_e32 v5, v9 +; VI-NEXT: v_mov_b32_e32 v7, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -18875,221 +18886,225 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { ; VI-LABEL: bitcast_v4i64_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v35, v5 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_mov_b32_e32 v37, v3 +; VI-NEXT: v_mov_b32_e32 v36, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB68_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: .LBB68_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB68_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v38, vcc, 3, v38 +; VI-NEXT: v_addc_u32_e32 v39, vcc, 0, v39, vcc +; VI-NEXT: v_add_u32_e32 v36, vcc, 3, v36 +; VI-NEXT: v_addc_u32_e32 v37, vcc, 0, v37, vcc ; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 ; VI-NEXT: v_addc_u32_e32 v35, vcc, 0, v35, vcc ; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 ; VI-NEXT: v_addc_u32_e32 v33, vcc, 0, v33, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: .LBB68_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v8, v34 -; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: v_mov_b32_e32 v0, v38 +; VI-NEXT: v_mov_b32_e32 v4, v39 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v12, v37 +; VI-NEXT: v_mov_b32_e32 v16, v34 +; VI-NEXT: v_mov_b32_e32 v20, v35 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v4i64_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 -; GFX9-NEXT: v_mov_b32_e32 v35, v3 -; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v35, v5 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_mov_b32_e32 v37, v3 +; GFX9-NEXT: v_mov_b32_e32 v36, v2 +; GFX9-NEXT: v_mov_b32_e32 v39, v1 +; GFX9-NEXT: v_mov_b32_e32 v38, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB68_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB68_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v38, vcc, 3, v38 +; GFX9-NEXT: v_addc_co_u32_e32 v39, vcc, 0, v39, vcc +; GFX9-NEXT: v_add_co_u32_e32 v36, vcc, 3, v36 +; GFX9-NEXT: v_addc_co_u32_e32 v37, vcc, 0, v37, vcc ; GFX9-NEXT: v_add_co_u32_e32 v34, vcc, 3, v34 ; GFX9-NEXT: v_addc_co_u32_e32 v35, vcc, 0, v35, vcc ; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 ; GFX9-NEXT: v_addc_co_u32_e32 v33, vcc, 0, v33, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB68_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v34 -; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v12, v37 +; GFX9-NEXT: v_mov_b32_e32 v16, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v35 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v4i64_to_v32i8: @@ -20149,14 +20164,14 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v7, 0x300 -; VI-NEXT: v_add_u16_e32 v2, 3, v33 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 ; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 ; VI-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 @@ -20801,42 +20816,6 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v17 ; SI-NEXT: s_cbranch_scc0 .LBB71_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 -; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v23, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -20854,19 +20833,55 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_or_b32 s6, s5, s6 +; SI-NEXT: s_and_b32 s5, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s7, s5, s7 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v3, s5, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s7 ; SI-NEXT: s_cbranch_execnz .LBB71_3 ; SI-NEXT: .LBB71_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -20993,24 +21008,6 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB71_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -21028,19 +21025,37 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_or_b32 s6, s5, s6 +; VI-NEXT: s_and_b32 s5, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s7, s5, s7 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s5, v0 +; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: s_cbranch_execnz .LBB71_3 ; VI-NEXT: .LBB71_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 @@ -21149,24 +21164,6 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -21184,19 +21181,37 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_or_b32 s6, s5, s6 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s7, s5, s7 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_cbranch_execnz .LBB71_3 ; GFX9-NEXT: .LBB71_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s16, s16, 3 @@ -22628,56 +22643,56 @@ define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: .LBB78_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 @@ -23441,18 +23456,18 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: .LBB82_4: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -24118,16 +24133,16 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; SI-NEXT: v_mul_f32_e64 v27, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v14, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v25, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v8, 1.0, v0 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 ; SI-NEXT: v_lshr_b64 v[0:1], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v30 ; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 16 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 ; SI-NEXT: v_lshr_b64 v[2:3], v[18:19], 16 @@ -24147,15 +24162,15 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v29 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -24208,153 +24223,154 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_mov_b32_e32 v8, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v8 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v8 +; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[2:3] -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 -; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[2:3] -; VI-NEXT: v_add_f32_e32 v2, s4, v8 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 -; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v8 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[1:2] +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_cndmask_b32_e32 v13, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; VI-NEXT: v_add_f32_e32 v3, s4, v8 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; VI-NEXT: v_add_f32_e32 v3, s6, v8 -; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v7 -; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[0:1] -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[11:12] -; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[13:14] -; VI-NEXT: v_cndmask_b32_e64 v0, v5, v15, s[4:5] -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, v10 -; VI-NEXT: v_mov_b32_e32 v3, v9 -; VI-NEXT: v_mov_b32_e32 v5, v8 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 16, v[11:12] +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v12, v3, v5, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[12:13] +; VI-NEXT: v_mov_b32_e32 v1, v11 +; VI-NEXT: v_mov_b32_e32 v3, v10 +; VI-NEXT: v_mov_b32_e32 v5, v9 +; VI-NEXT: v_mov_b32_e32 v7, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -24944,15 +24960,21 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v4f64_to_v32i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v7 +; SI-NEXT: v_mov_b32_e32 v32, v6 ; SI-NEXT: v_mov_b32_e32 v35, v5 ; SI-NEXT: v_mov_b32_e32 v34, v4 +; SI-NEXT: v_mov_b32_e32 v37, v3 +; SI-NEXT: v_mov_b32_e32 v36, v2 +; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: v_mov_b32_e32 v38, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 @@ -24975,288 +24997,288 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v27, v7, v6, 24 -; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 -; SI-NEXT: v_alignbit_b32 v25, v7, v6, 8 +; SI-NEXT: v_alignbit_b32 v27, v33, v32, 24 +; SI-NEXT: v_alignbit_b32 v26, v33, v32, 16 +; SI-NEXT: v_alignbit_b32 v25, v33, v32, 8 ; SI-NEXT: v_alignbit_b32 v19, v35, v34, 24 ; SI-NEXT: v_alignbit_b32 v18, v35, v34, 16 ; SI-NEXT: v_alignbit_b32 v17, v35, v34, 8 -; SI-NEXT: v_alignbit_b32 v11, v3, v2, 24 -; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; SI-NEXT: v_alignbit_b32 v9, v3, v2, 8 -; SI-NEXT: v_alignbit_b32 v38, v1, v0, 24 -; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v33, v1, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; SI-NEXT: v_alignbit_b32 v11, v37, v36, 24 +; SI-NEXT: v_alignbit_b32 v10, v37, v36, 16 +; SI-NEXT: v_alignbit_b32 v9, v37, v36, 8 +; SI-NEXT: v_alignbit_b32 v3, v39, v38, 24 +; SI-NEXT: v_alignbit_b32 v2, v39, v38, 16 +; SI-NEXT: v_alignbit_b32 v1, v39, v38, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 ; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 ; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; SI-NEXT: .LBB84_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_add_f64 v[38:39], v[38:39], 1.0 +; SI-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 +; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; SI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 -; SI-NEXT: v_alignbit_b32 v27, v7, v6, 24 -; SI-NEXT: v_alignbit_b32 v26, v7, v6, 16 -; SI-NEXT: v_alignbit_b32 v25, v7, v6, 8 +; SI-NEXT: v_alignbit_b32 v27, v33, v32, 24 +; SI-NEXT: v_alignbit_b32 v26, v33, v32, 16 +; SI-NEXT: v_alignbit_b32 v25, v33, v32, 8 ; SI-NEXT: v_alignbit_b32 v19, v35, v34, 24 ; SI-NEXT: v_alignbit_b32 v18, v35, v34, 16 ; SI-NEXT: v_alignbit_b32 v17, v35, v34, 8 -; SI-NEXT: v_alignbit_b32 v11, v3, v2, 24 -; SI-NEXT: v_alignbit_b32 v10, v3, v2, 16 -; SI-NEXT: v_alignbit_b32 v9, v3, v2, 8 -; SI-NEXT: v_alignbit_b32 v38, v1, v0, 24 -; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v33, v1, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; SI-NEXT: v_alignbit_b32 v11, v37, v36, 24 +; SI-NEXT: v_alignbit_b32 v10, v37, v36, 16 +; SI-NEXT: v_alignbit_b32 v9, v37, v36, 8 +; SI-NEXT: v_alignbit_b32 v3, v39, v38, 24 +; SI-NEXT: v_alignbit_b32 v2, v39, v38, 16 +; SI-NEXT: v_alignbit_b32 v1, v39, v38, 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 ; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 ; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 ; SI-NEXT: .LBB84_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v4, v1 -; SI-NEXT: v_mov_b32_e32 v8, v2 -; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v0, v38 +; SI-NEXT: v_mov_b32_e32 v4, v39 +; SI-NEXT: v_mov_b32_e32 v8, v36 +; SI-NEXT: v_mov_b32_e32 v12, v37 ; SI-NEXT: v_mov_b32_e32 v16, v34 ; SI-NEXT: v_mov_b32_e32 v20, v35 -; SI-NEXT: v_mov_b32_e32 v24, v6 -; SI-NEXT: v_mov_b32_e32 v28, v7 -; SI-NEXT: v_mov_b32_e32 v1, v33 -; SI-NEXT: v_mov_b32_e32 v2, v32 -; SI-NEXT: v_mov_b32_e32 v3, v38 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_mov_b32_e32 v7, v36 +; SI-NEXT: v_mov_b32_e32 v24, v32 +; SI-NEXT: v_mov_b32_e32 v28, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v35, v5 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_mov_b32_e32 v37, v3 +; VI-NEXT: v_mov_b32_e32 v36, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB84_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: .LBB84_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB84_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 +; VI-NEXT: v_add_f64 v[38:39], v[38:39], 1.0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: .LBB84_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v8, v34 -; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: v_mov_b32_e32 v0, v38 +; VI-NEXT: v_mov_b32_e32 v4, v39 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v12, v37 +; VI-NEXT: v_mov_b32_e32 v16, v34 +; VI-NEXT: v_mov_b32_e32 v20, v35 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v4f64_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 -; GFX9-NEXT: v_mov_b32_e32 v35, v3 -; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v35, v5 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_mov_b32_e32 v37, v3 +; GFX9-NEXT: v_mov_b32_e32 v36, v2 +; GFX9-NEXT: v_mov_b32_e32 v39, v1 +; GFX9-NEXT: v_mov_b32_e32 v38, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB84_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB84_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB84_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 +; GFX9-NEXT: v_add_f64 v[38:39], v[38:39], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB84_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v34 -; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v12, v37 +; GFX9-NEXT: v_mov_b32_e32 v16, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v35 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v4f64_to_v32i8: @@ -25508,34 +25530,34 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB85_4 ; SI-NEXT: .LBB85_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[50:51], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[37:38], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[48:49], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[35:36], s[18:19], 1.0 -; SI-NEXT: v_lshr_b64 v[24:25], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[37:38], 16 -; SI-NEXT: v_lshr_b64 v[0:1], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[50:51], 24 -; SI-NEXT: v_lshr_b64 v[25:26], v[50:51], 8 -; SI-NEXT: v_lshr_b64 v[19:20], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[11:12], v[35:36], 24 -; SI-NEXT: v_lshr_b64 v[33:34], v[35:36], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[35:36], 8 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v51 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v51 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v38 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v38 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v36 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v49 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 +; SI-NEXT: v_add_f64 v[34:35], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[38:39], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[50:51], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[54:55], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[27:28], v[34:35], 24 +; SI-NEXT: v_lshr_b64 v[32:33], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[34:35], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[38:39], 24 +; SI-NEXT: v_lshr_b64 v[36:37], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[38:39], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[50:51], 24 +; SI-NEXT: v_lshr_b64 v[48:49], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[50:51], 8 +; SI-NEXT: v_lshr_b64 v[3:4], v[54:55], 24 +; SI-NEXT: v_lshr_b64 v[52:53], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v39 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v51 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v55 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v55 ; SI-NEXT: s_branch .LBB85_5 ; SI-NEXT: .LBB85_3: ; SI-NEXT: ; implicit-def: $sgpr44 @@ -25564,14 +25586,14 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: s_branch .LBB85_2 ; SI-NEXT: .LBB85_4: -; SI-NEXT: v_mov_b32_e32 v51, s23 -; SI-NEXT: v_mov_b32_e32 v38, s21 -; SI-NEXT: v_mov_b32_e32 v36, s19 -; SI-NEXT: v_mov_b32_e32 v49, s17 -; SI-NEXT: v_mov_b32_e32 v48, s16 -; SI-NEXT: v_mov_b32_e32 v35, s18 -; SI-NEXT: v_mov_b32_e32 v37, s20 -; SI-NEXT: v_mov_b32_e32 v50, s22 +; SI-NEXT: v_mov_b32_e32 v35, s23 +; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v51, s19 +; SI-NEXT: v_mov_b32_e32 v55, s17 +; SI-NEXT: v_mov_b32_e32 v54, s16 +; SI-NEXT: v_mov_b32_e32 v50, s18 +; SI-NEXT: v_mov_b32_e32 v38, s20 +; SI-NEXT: v_mov_b32_e32 v34, s22 ; SI-NEXT: v_mov_b32_e32 v31, s75 ; SI-NEXT: v_mov_b32_e32 v30, s74 ; SI-NEXT: v_mov_b32_e32 v29, s73 @@ -25585,30 +25607,30 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v6, s57 ; SI-NEXT: v_mov_b32_e32 v5, s56 ; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: v_mov_b32_e32 v0, s42 +; SI-NEXT: v_mov_b32_e32 v52, s42 ; SI-NEXT: v_mov_b32_e32 v3, s40 ; SI-NEXT: v_mov_b32_e32 v9, s28 -; SI-NEXT: v_mov_b32_e32 v33, s26 +; SI-NEXT: v_mov_b32_e32 v48, s26 ; SI-NEXT: v_mov_b32_e32 v11, s24 ; SI-NEXT: v_mov_b32_e32 v17, s14 -; SI-NEXT: v_mov_b32_e32 v32, s12 +; SI-NEXT: v_mov_b32_e32 v36, s12 ; SI-NEXT: v_mov_b32_e32 v19, s10 ; SI-NEXT: v_mov_b32_e32 v25, s8 -; SI-NEXT: v_mov_b32_e32 v24, s6 +; SI-NEXT: v_mov_b32_e32 v32, s6 ; SI-NEXT: v_mov_b32_e32 v27, s4 ; SI-NEXT: .LBB85_5: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: v_mov_b32_e32 v0, v48 -; SI-NEXT: v_mov_b32_e32 v4, v49 -; SI-NEXT: v_mov_b32_e32 v10, v33 -; SI-NEXT: v_mov_b32_e32 v8, v35 -; SI-NEXT: v_mov_b32_e32 v12, v36 -; SI-NEXT: v_mov_b32_e32 v18, v32 -; SI-NEXT: v_mov_b32_e32 v16, v37 -; SI-NEXT: v_mov_b32_e32 v20, v38 -; SI-NEXT: v_mov_b32_e32 v26, v24 -; SI-NEXT: v_mov_b32_e32 v24, v50 -; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v0, v54 +; SI-NEXT: v_mov_b32_e32 v2, v52 +; SI-NEXT: v_mov_b32_e32 v4, v55 +; SI-NEXT: v_mov_b32_e32 v8, v50 +; SI-NEXT: v_mov_b32_e32 v10, v48 +; SI-NEXT: v_mov_b32_e32 v12, v51 +; SI-NEXT: v_mov_b32_e32 v16, v38 +; SI-NEXT: v_mov_b32_e32 v18, v36 +; SI-NEXT: v_mov_b32_e32 v20, v39 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v26, v32 +; SI-NEXT: v_mov_b32_e32 v28, v35 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v32i8_scalar: @@ -25643,34 +25665,34 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB85_4 ; VI-NEXT: .LBB85_2: ; %cmp.true -; VI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; VI-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 -; VI-NEXT: v_add_f64 v[16:17], s[20:21], 1.0 -; VI-NEXT: v_add_f64 v[24:25], s[22:23], 1.0 -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; VI-NEXT: v_add_f64 v[32:33], s[22:23], 1.0 +; VI-NEXT: v_add_f64 v[34:35], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[36:37], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[38:39], s[16:17], 1.0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: s_branch .LBB85_5 ; VI-NEXT: .LBB85_3: ; VI-NEXT: ; implicit-def: $sgpr58 @@ -25699,22 +25721,26 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr43 ; VI-NEXT: s_branch .LBB85_2 ; VI-NEXT: .LBB85_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v16, s20 -; VI-NEXT: v_mov_b32_e32 v24, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 -; VI-NEXT: v_mov_b32_e32 v17, s21 -; VI-NEXT: v_mov_b32_e32 v9, s19 -; VI-NEXT: v_mov_b32_e32 v1, s17 +; VI-NEXT: v_mov_b32_e32 v38, s16 +; VI-NEXT: v_mov_b32_e32 v36, s18 +; VI-NEXT: v_mov_b32_e32 v34, s20 +; VI-NEXT: v_mov_b32_e32 v32, s22 +; VI-NEXT: v_mov_b32_e32 v33, s23 +; VI-NEXT: v_mov_b32_e32 v35, s21 +; VI-NEXT: v_mov_b32_e32 v37, s19 +; VI-NEXT: v_mov_b32_e32 v39, s17 ; VI-NEXT: v_mov_b32_e32 v2, s59 -; VI-NEXT: v_mov_b32_e32 v35, s58 +; VI-NEXT: v_mov_b32_e32 v1, s58 ; VI-NEXT: v_mov_b32_e32 v10, s57 -; VI-NEXT: v_mov_b32_e32 v34, s56 +; VI-NEXT: v_mov_b32_e32 v9, s56 ; VI-NEXT: v_mov_b32_e32 v18, s47 -; VI-NEXT: v_mov_b32_e32 v33, s46 +; VI-NEXT: v_mov_b32_e32 v17, s46 ; VI-NEXT: v_mov_b32_e32 v26, s45 -; VI-NEXT: v_mov_b32_e32 v32, s44 +; VI-NEXT: v_mov_b32_e32 v25, s44 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v11, s6 +; VI-NEXT: v_mov_b32_e32 v19, s8 +; VI-NEXT: v_mov_b32_e32 v27, s10 ; VI-NEXT: v_mov_b32_e32 v31, s43 ; VI-NEXT: v_mov_b32_e32 v30, s42 ; VI-NEXT: v_mov_b32_e32 v29, s41 @@ -25727,19 +25753,15 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v7, s24 ; VI-NEXT: v_mov_b32_e32 v6, s15 ; VI-NEXT: v_mov_b32_e32 v5, s14 -; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_mov_b32_e32 v11, s6 -; VI-NEXT: v_mov_b32_e32 v19, s8 -; VI-NEXT: v_mov_b32_e32 v27, s10 ; VI-NEXT: .LBB85_5: ; %end -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v12, v9 -; VI-NEXT: v_mov_b32_e32 v20, v17 -; VI-NEXT: v_mov_b32_e32 v28, v25 -; VI-NEXT: v_mov_b32_e32 v1, v35 -; VI-NEXT: v_mov_b32_e32 v9, v34 -; VI-NEXT: v_mov_b32_e32 v17, v33 -; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: v_mov_b32_e32 v0, v38 +; VI-NEXT: v_mov_b32_e32 v4, v39 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v12, v37 +; VI-NEXT: v_mov_b32_e32 v16, v34 +; VI-NEXT: v_mov_b32_e32 v20, v35 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v4f64_to_v32i8_scalar: @@ -25774,34 +25796,34 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB85_4 ; GFX9-NEXT: .LBB85_2: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[16:17], s[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[24:25], s[22:23], 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; GFX9-NEXT: v_add_f64 v[32:33], s[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[34:35], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[36:37], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[38:39], s[16:17], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: s_branch .LBB85_5 ; GFX9-NEXT: .LBB85_3: ; GFX9-NEXT: ; implicit-def: $sgpr58 @@ -25830,22 +25852,26 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr43 ; GFX9-NEXT: s_branch .LBB85_2 ; GFX9-NEXT: .LBB85_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v8, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s20 -; GFX9-NEXT: v_mov_b32_e32 v24, s22 -; GFX9-NEXT: v_mov_b32_e32 v25, s23 -; GFX9-NEXT: v_mov_b32_e32 v17, s21 -; GFX9-NEXT: v_mov_b32_e32 v9, s19 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v38, s16 +; GFX9-NEXT: v_mov_b32_e32 v36, s18 +; GFX9-NEXT: v_mov_b32_e32 v34, s20 +; GFX9-NEXT: v_mov_b32_e32 v32, s22 +; GFX9-NEXT: v_mov_b32_e32 v33, s23 +; GFX9-NEXT: v_mov_b32_e32 v35, s21 +; GFX9-NEXT: v_mov_b32_e32 v37, s19 +; GFX9-NEXT: v_mov_b32_e32 v39, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s59 -; GFX9-NEXT: v_mov_b32_e32 v35, s58 +; GFX9-NEXT: v_mov_b32_e32 v1, s58 ; GFX9-NEXT: v_mov_b32_e32 v10, s57 -; GFX9-NEXT: v_mov_b32_e32 v34, s56 +; GFX9-NEXT: v_mov_b32_e32 v9, s56 ; GFX9-NEXT: v_mov_b32_e32 v18, s47 -; GFX9-NEXT: v_mov_b32_e32 v33, s46 +; GFX9-NEXT: v_mov_b32_e32 v17, s46 ; GFX9-NEXT: v_mov_b32_e32 v26, s45 -; GFX9-NEXT: v_mov_b32_e32 v32, s44 +; GFX9-NEXT: v_mov_b32_e32 v25, s44 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v11, s6 +; GFX9-NEXT: v_mov_b32_e32 v19, s8 +; GFX9-NEXT: v_mov_b32_e32 v27, s10 ; GFX9-NEXT: v_mov_b32_e32 v31, s43 ; GFX9-NEXT: v_mov_b32_e32 v30, s42 ; GFX9-NEXT: v_mov_b32_e32 v29, s41 @@ -25858,19 +25884,15 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v7, s24 ; GFX9-NEXT: v_mov_b32_e32 v6, s15 ; GFX9-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v11, s6 -; GFX9-NEXT: v_mov_b32_e32 v19, s8 -; GFX9-NEXT: v_mov_b32_e32 v27, s10 ; GFX9-NEXT: .LBB85_5: ; %end -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-NEXT: v_mov_b32_e32 v20, v17 -; GFX9-NEXT: v_mov_b32_e32 v28, v25 -; GFX9-NEXT: v_mov_b32_e32 v1, v35 -; GFX9-NEXT: v_mov_b32_e32 v9, v34 -; GFX9-NEXT: v_mov_b32_e32 v17, v33 -; GFX9-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v12, v37 +; GFX9-NEXT: v_mov_b32_e32 v16, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v35 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: bitcast_v4f64_to_v32i8_scalar: @@ -26339,14 +26361,14 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v7, 0x300 -; VI-NEXT: v_add_u16_e32 v2, 3, v33 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_add_u16_e32 v1, 0x300, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 ; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 ; VI-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v8 @@ -26991,42 +27013,6 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v17 ; SI-NEXT: s_cbranch_scc0 .LBB87_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 -; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v23, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -27044,19 +27030,55 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_or_b32 s6, s5, s6 +; SI-NEXT: s_and_b32 s5, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s7, s5, s7 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 +; SI-NEXT: s_or_b32 s5, s5, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v3, s5, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_or_b32_e32 v0, v0, v26 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s7 ; SI-NEXT: s_cbranch_execnz .LBB87_3 ; SI-NEXT: .LBB87_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -27183,24 +27205,6 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB87_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -27218,19 +27222,37 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_or_b32 s6, s5, s6 +; VI-NEXT: s_and_b32 s5, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s7, s5, s7 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s5, v0 +; VI-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: s_cbranch_execnz .LBB87_3 ; VI-NEXT: .LBB87_2: ; %cmp.true ; VI-NEXT: s_add_i32 s16, s16, 3 @@ -27339,24 +27361,6 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -27374,19 +27378,37 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff +; GFX9-NEXT: s_or_b32 s6, s5, s6 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s7, s5, s7 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_cbranch_execnz .LBB87_3 ; GFX9-NEXT: .LBB87_2: ; %cmp.true ; GFX9-NEXT: s_add_i32 s16, s16, 3 @@ -28319,24 +28341,25 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i ; SI-LABEL: bitcast_v16f16_to_v16i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v9, v0 +; SI-NEXT: v_mov_b32_e32 v5, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v18, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB91_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -28352,59 +28375,59 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_or_b32_e32 v14, v14, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v10, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v6, v6, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v16 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshr_b64 v[18:19], v[1:2], 16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshr_b64 v[18:19], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[9:10], 16 ; SI-NEXT: v_lshr_b64 v[16:17], v[13:14], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 @@ -28412,8 +28435,8 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: .LBB91_3: ; %end ; SI-NEXT: v_mov_b32_e32 v1, v18 -; SI-NEXT: v_mov_b32_e32 v5, v21 -; SI-NEXT: v_mov_b32_e32 v9, v19 +; SI-NEXT: v_mov_b32_e32 v5, v19 +; SI-NEXT: v_mov_b32_e32 v9, v20 ; SI-NEXT: v_mov_b32_e32 v13, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB91_4: @@ -28432,25 +28455,18 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i ; VI-NEXT: s_lshr_b32 s4, s17, 16 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x200 ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v6, s5 -; VI-NEXT: s_lshr_b32 s5, s23, 16 ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_add_f16_e32 v5, s22, v0 -; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v13, s5 -; VI-NEXT: v_add_f16_e32 v7, s23, v0 -; VI-NEXT: v_add_f16_sdwa v13, v13, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v5, v6 ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v7, v7, v13 -; VI-NEXT: v_add_f16_sdwa v13, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v16, s4 ; VI-NEXT: v_add_f16_e32 v8, s16, v0 ; VI-NEXT: v_add_f16_sdwa v9, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s17, v0 @@ -28460,9 +28476,16 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i ; VI-NEXT: v_add_f16_e32 v3, s19, v0 ; VI-NEXT: v_add_f16_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v4, s20, v0 -; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s21, v0 -; VI-NEXT: v_or_b32_e32 v5, v0, v5 +; VI-NEXT: v_add_f16_sdwa v13, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_add_f16_sdwa v14, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s22, v0 +; VI-NEXT: v_add_f16_sdwa v15, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s23, v0 +; VI-NEXT: v_add_f16_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v0 +; VI-NEXT: v_or_b32_e32 v6, v6, v15 +; VI-NEXT: v_or_b32_e32 v5, v5, v14 ; VI-NEXT: v_or_b32_e32 v4, v4, v13 ; VI-NEXT: v_or_b32_e32 v3, v3, v12 ; VI-NEXT: v_or_b32_e32 v2, v2, v11 @@ -29792,20 +29815,20 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 @@ -29877,20 +29900,20 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v16bf16_to_v16i16_scalar: @@ -29902,8 +29925,8 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: s_cbranch_execnz .LBB95_4 ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v10, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v10 +; VI-NEXT: v_mov_b32_e32 v15, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v15 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 @@ -29911,7 +29934,7 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v10 +; VI-NEXT: v_add_f32_e32 v1, s4, v15 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 @@ -29919,14 +29942,14 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_add_f32_e32 v2, s4, v15 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_add_f32_e32 v2, s4, v15 ; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -29936,7 +29959,7 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_add_f32_e32 v2, s4, v15 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 @@ -29944,7 +29967,7 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v10 +; VI-NEXT: v_add_f32_e32 v3, s4, v15 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 @@ -29952,15 +29975,15 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_add_f32_e32 v4, s4, v15 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v10 -; VI-NEXT: v_cndmask_b32_e32 v11, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v15 +; VI-NEXT: v_cndmask_b32_e32 v10, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 @@ -29968,8 +29991,8 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v15 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 @@ -29977,79 +30000,79 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v10 +; VI-NEXT: v_add_f32_e32 v5, s4, v15 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v13, s5, v10 -; VI-NEXT: s_lshl_b32 s5, s22, 16 +; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s5, v10 +; VI-NEXT: v_add_f32_e32 v6, s4, v15 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v6, s4, v15 +; VI-NEXT: v_cndmask_b32_e32 v12, v7, v12, vcc +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v13, vcc +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; VI-NEXT: v_add_f32_e32 v6, s4, v15 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_or_b32_e32 v14, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v6, v7, v14, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v10 +; VI-NEXT: v_add_f32_e32 v7, s4, v15 ; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] -; VI-NEXT: v_cndmask_b32_e32 v7, v14, v15, vcc -; VI-NEXT: v_bfe_u32 v5, v13, 16, 1 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v16, vcc +; VI-NEXT: v_add_f32_e32 v14, s4, v15 +; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_add_f32_e32 v15, s4, v15 +; VI-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v13 -; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s4, v10 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v7, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v13 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v7, v14, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v10 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v15, vcc -; VI-NEXT: v_add_f32_e32 v7, s4, v10 -; VI-NEXT: v_bfe_u32 v10, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v7 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v15, v10, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] ; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] ; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] ; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, v7 ; VI-NEXT: v_mov_b32_e32 v3, v10 -; VI-NEXT: v_mov_b32_e32 v5, v13 -; VI-NEXT: v_mov_b32_e32 v7, v15 +; VI-NEXT: v_mov_b32_e32 v5, v12 +; VI-NEXT: v_mov_b32_e32 v7, v14 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -30133,91 +30156,91 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: s_and_b32 s5, s22, 0xffff0000 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_add_f32_e32 v4, s5, v1 +; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: s_lshl_b32 s5, s22, 16 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add_f32_e32 v5, s5, v1 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v6, v7, vcc ; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 ; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: s_and_b32 s5, s23, 0xffff0000 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX9-NEXT: v_add_f32_e32 v6, s5, v1 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v1 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v7, v14, vcc ; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 ; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 -; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: s_lshl_b32 s5, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v13, vcc -; GFX9-NEXT: v_add_f32_e32 v7, s5, v1 -; GFX9-NEXT: v_bfe_u32 v13, v7, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v13, v13, v7 -; GFX9-NEXT: v_add_u32_e32 v13, 0x7fff, v13 -; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v15, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s4, v1 +; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v15, v15, v7 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v15, 0x7fff, v15 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v16, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v16, v16, v7 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_mov_b32_e32 v13, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_and_or_b32 v7, v6, v13, v7 -; GFX9-NEXT: v_and_or_b32 v6, v4, v13, v5 -; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 -; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v14, vcc -; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 -; GFX9-NEXT: v_bfe_u32 v14, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v5 -; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v15, vcc -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_and_or_b32 v5, v4, v13, v5 -; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 -; GFX9-NEXT: v_bfe_u32 v14, v4, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v4 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v15, vcc -; GFX9-NEXT: v_bfe_u32 v14, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v1 -; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc +; GFX9-NEXT: v_bfe_u32 v16, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v16, v16, v1 +; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v4, v4, v13, v1 +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v7, v7, v16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; GFX9-NEXT: v_and_or_b32 v6, v6, v16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GFX9-NEXT: v_and_or_b32 v5, v5, v16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; GFX9-NEXT: v_and_or_b32 v4, v4, v16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GFX9-NEXT: v_and_or_b32 v3, v3, v13, v1 +; GFX9-NEXT: v_and_or_b32 v3, v3, v16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; GFX9-NEXT: v_and_or_b32 v2, v2, v13, v1 +; GFX9-NEXT: v_and_or_b32 v2, v2, v16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_and_or_b32 v1, v9, v13, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v13, v8 +; GFX9-NEXT: v_and_or_b32 v1, v9, v16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v16, v8 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB95_3: ; GFX9-NEXT: s_branch .LBB95_2 @@ -30773,39 +30796,43 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; VI-LABEL: bitcast_v16i16_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v34, v5 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v37, v3 +; VI-NEXT: v_mov_b32_e32 v36, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr5 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_2 @@ -30813,93 +30840,90 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v5 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v34 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 ; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[33:34] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; VI-NEXT: v_mov_b32_e32 v50, v0 ; VI-NEXT: v_mov_b32_e32 v48, v1 -; VI-NEXT: v_mov_b32_e32 v8, v2 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v16, v4 -; VI-NEXT: v_mov_b32_e32 v49, v5 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v35, v37 +; VI-NEXT: v_mov_b32_e32 v16, v33 +; VI-NEXT: v_mov_b32_e32 v49, v34 ; VI-NEXT: v_mov_b32_e32 v24, v6 ; VI-NEXT: v_mov_b32_e32 v51, v7 ; VI-NEXT: ; implicit-def: $vgpr1 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: .LBB96_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v9, 3 -; VI-NEXT: v_add_u16_sdwa v36, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v32, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v14, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v22, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v18, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v30, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v26, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, 3 +; VI-NEXT: v_add_u16_sdwa v14, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v35, 3, v37 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_add_u16_sdwa v10, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v37, v35, v4 +; VI-NEXT: v_add_u16_e32 v8, 3, v36 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; VI-NEXT: v_add_u16_sdwa v22, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v36, v8, v4 +; VI-NEXT: v_add_u16_e32 v49, 3, v34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; VI-NEXT: v_add_u16_sdwa v18, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v32, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v34, v49, v4 +; VI-NEXT: v_add_u16_e32 v16, 3, v33 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; VI-NEXT: v_add_u16_sdwa v30, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v26, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v48, 3, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 ; VI-NEXT: v_add_u16_e32 v50, 3, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; VI-NEXT: v_add_u16_e32 v35, 3, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 -; VI-NEXT: v_add_u16_e32 v8, 3, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; VI-NEXT: v_add_u16_e32 v49, 3, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; VI-NEXT: v_add_u16_e32 v16, 3, v4 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_or_b32_e32 v33, v16, v4 ; VI-NEXT: v_add_u16_e32 v51, 3, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 ; VI-NEXT: v_add_u16_e32 v24, 3, v6 -; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; VI-NEXT: v_or_b32_e32 v1, v48, v1 ; VI-NEXT: v_or_b32_e32 v0, v50, v0 -; VI-NEXT: v_or_b32_e32 v3, v35, v3 -; VI-NEXT: v_or_b32_e32 v2, v8, v2 -; VI-NEXT: v_or_b32_e32 v5, v49, v5 -; VI-NEXT: v_or_b32_e32 v4, v16, v4 -; VI-NEXT: v_or_b32_e32 v7, v51, v7 -; VI-NEXT: v_or_b32_e32 v6, v24, v6 +; VI-NEXT: v_or_b32_e32 v7, v51, v4 +; VI-NEXT: v_or_b32_e32 v6, v24, v3 ; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[33:34] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 ; VI-NEXT: v_bfe_u32 v31, v30, 8, 8 ; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; VI-NEXT: v_bfe_u32 v39, v36, 8, 8 +; VI-NEXT: v_bfe_u32 v39, v32, 8, 8 ; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v50 ; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v2, v32 -; VI-NEXT: v_mov_b32_e32 v3, v33 ; VI-NEXT: v_mov_b32_e32 v4, v48 -; VI-NEXT: v_mov_b32_e32 v5, v37 -; VI-NEXT: v_mov_b32_e32 v6, v36 +; VI-NEXT: v_mov_b32_e32 v6, v32 ; VI-NEXT: v_mov_b32_e32 v7, v39 ; VI-NEXT: v_mov_b32_e32 v12, v35 ; VI-NEXT: v_mov_b32_e32 v20, v49 @@ -30909,111 +30933,113 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { ; GFX9-LABEL: bitcast_v16i16_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 -; GFX9-NEXT: v_mov_b32_e32 v35, v3 -; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v35, v5 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_mov_b32_e32 v37, v3 +; GFX9-NEXT: v_mov_b32_e32 v36, v2 +; GFX9-NEXT: v_mov_b32_e32 v39, v1 +; GFX9-NEXT: v_mov_b32_e32 v38, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB96_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB96_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v39, v39, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v38, v38, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v37, v37, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v36, v36, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v35, v35, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v34, v34, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v33, v33, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB96_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v34 -; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v12, v37 +; GFX9-NEXT: v_mov_b32_e32 v16, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v35 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v16i16_to_v32i8: @@ -31270,6 +31296,7 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_and_b32 s11, s79, 0xffff ; SI-NEXT: s_lshl_b32 s13, s78, 16 +; SI-NEXT: s_or_b32 s11, s11, s13 ; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 8 ; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 @@ -31277,23 +31304,22 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 ; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 24 ; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 -; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 8 ; SI-NEXT: s_lshr_b32 s13, s5, 8 ; SI-NEXT: s_lshr_b32 s41, s7, 8 ; SI-NEXT: s_lshr_b32 s47, s9, 8 -; SI-NEXT: s_lshr_b32 s88, s11, 8 +; SI-NEXT: s_lshr_b32 s61, s11, 8 ; SI-NEXT: s_and_b32 s15, s19, 0xffff ; SI-NEXT: s_and_b32 s45, s23, 0xffff ; SI-NEXT: s_and_b32 s59, s27, 0xffff -; SI-NEXT: s_and_b32 s90, s78, 0xffff +; SI-NEXT: s_and_b32 s73, s78, 0xffff ; SI-NEXT: s_bfe_u32 s43, s19, 0x80008 ; SI-NEXT: s_bfe_u32 s57, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s89, s27, 0x80008 -; SI-NEXT: s_bfe_u32 s91, s78, 0x80008 -; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 8 +; SI-NEXT: s_bfe_u32 s63, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s75, s78, 0x80008 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true ; SI-NEXT: s_add_i32 s28, s28, 3 @@ -31345,8 +31371,8 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 24 ; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 8 ; SI-NEXT: s_lshr_b32 s43, s5, 24 ; SI-NEXT: s_lshr_b32 s15, s5, 16 @@ -31354,12 +31380,12 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s57, s7, 24 ; SI-NEXT: s_lshr_b32 s45, s7, 16 ; SI-NEXT: s_lshr_b32 s41, s7, 8 -; SI-NEXT: s_lshr_b32 s89, s9, 24 +; SI-NEXT: s_lshr_b32 s63, s9, 24 ; SI-NEXT: s_lshr_b32 s59, s9, 16 ; SI-NEXT: s_lshr_b32 s47, s9, 8 -; SI-NEXT: s_lshr_b32 s91, s11, 24 -; SI-NEXT: s_lshr_b32 s90, s11, 16 -; SI-NEXT: s_lshr_b32 s88, s11, 8 +; SI-NEXT: s_lshr_b32 s75, s11, 24 +; SI-NEXT: s_lshr_b32 s73, s11, 16 +; SI-NEXT: s_lshr_b32 s61, s11, 8 ; SI-NEXT: .LBB97_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s40 @@ -31384,15 +31410,15 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v20, s9 ; SI-NEXT: v_mov_b32_e32 v21, s47 ; SI-NEXT: v_mov_b32_e32 v22, s59 -; SI-NEXT: v_mov_b32_e32 v23, s89 +; SI-NEXT: v_mov_b32_e32 v23, s63 ; SI-NEXT: v_mov_b32_e32 v24, s10 ; SI-NEXT: v_mov_b32_e32 v25, s74 -; SI-NEXT: v_mov_b32_e32 v26, s62 -; SI-NEXT: v_mov_b32_e32 v27, s72 +; SI-NEXT: v_mov_b32_e32 v26, s72 +; SI-NEXT: v_mov_b32_e32 v27, s62 ; SI-NEXT: v_mov_b32_e32 v28, s11 -; SI-NEXT: v_mov_b32_e32 v29, s88 -; SI-NEXT: v_mov_b32_e32 v30, s90 -; SI-NEXT: v_mov_b32_e32 v31, s91 +; SI-NEXT: v_mov_b32_e32 v29, s61 +; SI-NEXT: v_mov_b32_e32 v30, s73 +; SI-NEXT: v_mov_b32_e32 v31, s75 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: ; SI-NEXT: ; implicit-def: $sgpr4 @@ -31415,14 +31441,14 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v16i16_to_v32i8_scalar: @@ -31614,38 +31640,38 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB97_4 ; GFX9-NEXT: .LBB97_2: ; %cmp.true -; GFX9-NEXT: v_pk_add_u16 v1, s17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, s16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v9, s19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, s18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, s21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v16, s20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v25, s23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v24, s22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; GFX9-NEXT: v_pk_add_u16 v39, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v38, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v37, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v36, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v35, s21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v34, s20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v33, s23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, s22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: s_branch .LBB97_5 ; GFX9-NEXT: .LBB97_3: ; GFX9-NEXT: ; implicit-def: $sgpr59 @@ -31674,30 +31700,30 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr14 ; GFX9-NEXT: s_branch .LBB97_2 ; GFX9-NEXT: .LBB97_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v8, s18 -; GFX9-NEXT: v_mov_b32_e32 v9, s19 -; GFX9-NEXT: v_mov_b32_e32 v16, s20 -; GFX9-NEXT: v_mov_b32_e32 v17, s21 -; GFX9-NEXT: v_mov_b32_e32 v24, s22 -; GFX9-NEXT: v_mov_b32_e32 v25, s23 -; GFX9-NEXT: v_mov_b32_e32 v35, s59 +; GFX9-NEXT: v_mov_b32_e32 v38, s16 +; GFX9-NEXT: v_mov_b32_e32 v39, s17 +; GFX9-NEXT: v_mov_b32_e32 v36, s18 +; GFX9-NEXT: v_mov_b32_e32 v37, s19 +; GFX9-NEXT: v_mov_b32_e32 v34, s20 +; GFX9-NEXT: v_mov_b32_e32 v35, s21 +; GFX9-NEXT: v_mov_b32_e32 v32, s22 +; GFX9-NEXT: v_mov_b32_e32 v33, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s59 ; GFX9-NEXT: v_mov_b32_e32 v2, s57 ; GFX9-NEXT: v_mov_b32_e32 v5, s58 ; GFX9-NEXT: v_mov_b32_e32 v6, s56 ; GFX9-NEXT: v_mov_b32_e32 v7, s47 -; GFX9-NEXT: v_mov_b32_e32 v34, s46 +; GFX9-NEXT: v_mov_b32_e32 v9, s46 ; GFX9-NEXT: v_mov_b32_e32 v10, s44 ; GFX9-NEXT: v_mov_b32_e32 v13, s45 ; GFX9-NEXT: v_mov_b32_e32 v14, s43 ; GFX9-NEXT: v_mov_b32_e32 v15, s42 -; GFX9-NEXT: v_mov_b32_e32 v33, s41 +; GFX9-NEXT: v_mov_b32_e32 v17, s41 ; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_mov_b32_e32 v21, s40 ; GFX9-NEXT: v_mov_b32_e32 v22, s28 ; GFX9-NEXT: v_mov_b32_e32 v23, s27 -; GFX9-NEXT: v_mov_b32_e32 v32, s26 +; GFX9-NEXT: v_mov_b32_e32 v25, s26 ; GFX9-NEXT: v_mov_b32_e32 v26, s24 ; GFX9-NEXT: v_mov_b32_e32 v29, s25 ; GFX9-NEXT: v_mov_b32_e32 v30, s15 @@ -31707,14 +31733,14 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v11, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: .LBB97_5: ; %end -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-NEXT: v_mov_b32_e32 v20, v17 -; GFX9-NEXT: v_mov_b32_e32 v28, v25 -; GFX9-NEXT: v_mov_b32_e32 v1, v35 -; GFX9-NEXT: v_mov_b32_e32 v9, v34 -; GFX9-NEXT: v_mov_b32_e32 v17, v33 -; GFX9-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v12, v37 +; GFX9-NEXT: v_mov_b32_e32 v16, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v35 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: bitcast_v16i16_to_v32i8_scalar: @@ -32822,8 +32848,8 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_mov_b32_e32 v22, v14 ; SI-NEXT: v_mov_b32_e32 v21, v10 -; SI-NEXT: v_readfirstlane_b32 s43, v1 -; SI-NEXT: v_readfirstlane_b32 s42, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 @@ -32842,75 +32868,75 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s12, s6, s5 -; SI-NEXT: s_or_b32 s6, s4, s12 +; SI-NEXT: s_or_b32 s40, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s40 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v4 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_or_b32_e32 v9, v9, v23 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_or_b32 s14, s7, s5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v26, v3, v10 +; SI-NEXT: s_or_b32 s42, s7, s5 +; SI-NEXT: v_or_b32_e32 v19, v9, v26 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v12 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: v_or_b32_e32 v9, v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v28, v24, v10 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: v_or_b32_e32 v17, v9, v28 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s13, s5, s7 -; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 16 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: v_or_b32_e32 v9, v9, v1 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_lshr_b64 s[8:9], s[40:41], 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v11, v0, v10 ; SI-NEXT: s_and_b32 s5, s28, 0xff ; SI-NEXT: s_lshl_b32 s9, s29, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v27, v9, v11 ; SI-NEXT: s_or_b32 s5, s5, s9 -; SI-NEXT: s_and_b32 s9, s42, 0xff -; SI-NEXT: v_or_b32_e32 v9, v9, v23 -; SI-NEXT: v_or_b32_e32 v13, v24, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v15, v0, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_and_b32 s9, s14, 0xff +; SI-NEXT: v_lshr_b64 v[9:10], v[26:27], 16 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s10, s43, 24 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v10, v10, v1 -; SI-NEXT: v_or_b32_e32 v14, v14, v7 -; SI-NEXT: v_or_b32_e32 v26, v5, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v21 +; SI-NEXT: s_lshl_b32 s10, s15, 24 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s12, s10, s9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v10, v10, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_or_b32 s43, s5, s12 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v17, v17, v25 -; SI-NEXT: s_or_b32 s15, s5, s12 +; SI-NEXT: v_or_b32_e32 v15, v5, v13 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v3, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v15 -; SI-NEXT: v_or_b32_e32 v14, v14, v26 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 16 -; SI-NEXT: s_or_b32 s4, s4, s14 -; SI-NEXT: v_or_b32_e32 v19, v11, v9 -; SI-NEXT: v_mov_b32_e32 v20, v10 -; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_or_b32_e32 v17, v17, v13 -; SI-NEXT: v_mov_b32_e32 v18, v14 -; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[42:43], 16 +; SI-NEXT: v_or_b32_e32 v29, v10, v15 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: v_lshr_b64 v[13:14], v[28:29], 16 ; SI-NEXT: s_lshr_b32 s9, s7, 16 ; SI-NEXT: s_lshr_b32 s11, s12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; SI-NEXT: s_mov_b32 s7, s13 -; SI-NEXT: s_mov_b32 s5, s15 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_mov_b32 s7, s41 +; SI-NEXT: s_mov_b32 s5, s43 +; SI-NEXT: v_mov_b32_e32 v20, v27 +; SI-NEXT: v_mov_b32_e32 v18, v29 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true ; SI-NEXT: s_add_i32 s24, s24, 3 @@ -32930,19 +32956,19 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s28, 0xff ; SI-NEXT: s_lshl_b32 s6, s29, 8 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s42, 0xff +; SI-NEXT: s_and_b32 s7, s14, 0xff ; SI-NEXT: v_or_b32_e32 v9, v25, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_or_b32_e32 v2, v23, v2 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_lshl_b32 s6, s43, 24 +; SI-NEXT: s_lshl_b32 s6, s15, 24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 @@ -33066,24 +33092,6 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB99_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -33101,19 +33109,37 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_or_b32 s6, s5, s6 +; VI-NEXT: s_and_b32 s5, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s7, s5, s7 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s5, v0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: s_cbranch_execnz .LBB99_3 ; VI-NEXT: .LBB99_2: ; %cmp.true ; VI-NEXT: s_add_i32 s28, s28, 3 @@ -33222,53 +33248,53 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 ; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s5, s6 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s5, s7 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_or_b32_sdwa v4, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_cbranch_execnz .LBB99_3 ; GFX9-NEXT: .LBB99_2: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v3, 3, v12 @@ -34104,25 +34130,18 @@ define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg % ; VI-NEXT: s_lshr_b32 s4, s17, 16 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x200 ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s19, 16 -; VI-NEXT: v_mov_b32_e32 v6, s5 -; VI-NEXT: s_lshr_b32 s5, s23, 16 ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_add_f16_e32 v5, s22, v0 -; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v13, s5 -; VI-NEXT: v_add_f16_e32 v7, s23, v0 -; VI-NEXT: v_add_f16_sdwa v13, v13, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v5, v6 ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v7, v7, v13 -; VI-NEXT: v_add_f16_sdwa v13, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v16, s4 ; VI-NEXT: v_add_f16_e32 v8, s16, v0 ; VI-NEXT: v_add_f16_sdwa v9, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s17, v0 @@ -34132,9 +34151,16 @@ define inreg <16 x bfloat> @bitcast_v16f16_to_v16bf16_scalar(<16 x half> inreg % ; VI-NEXT: v_add_f16_e32 v3, s19, v0 ; VI-NEXT: v_add_f16_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v4, s20, v0 -; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s21, v0 -; VI-NEXT: v_or_b32_e32 v5, v0, v5 +; VI-NEXT: v_add_f16_sdwa v13, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_add_f16_sdwa v14, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s22, v0 +; VI-NEXT: v_add_f16_sdwa v15, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s23, v0 +; VI-NEXT: v_add_f16_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v0 +; VI-NEXT: v_or_b32_e32 v6, v6, v15 +; VI-NEXT: v_or_b32_e32 v5, v5, v14 ; VI-NEXT: v_or_b32_e32 v4, v4, v13 ; VI-NEXT: v_or_b32_e32 v3, v3, v12 ; VI-NEXT: v_or_b32_e32 v2, v2, v11 @@ -35164,8 +35190,8 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: s_cbranch_execnz .LBB103_4 ; VI-NEXT: .LBB103_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_mov_b32_e32 v10, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v10 +; VI-NEXT: v_mov_b32_e32 v15, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v15 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 @@ -35173,7 +35199,7 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v10 +; VI-NEXT: v_add_f32_e32 v1, s4, v15 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 @@ -35181,14 +35207,14 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_add_f32_e32 v2, s4, v15 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_add_f32_e32 v2, s4, v15 ; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -35198,7 +35224,7 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; VI-NEXT: v_add_f32_e32 v2, s4, v10 +; VI-NEXT: v_add_f32_e32 v2, s4, v15 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 @@ -35206,7 +35232,7 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v10 +; VI-NEXT: v_add_f32_e32 v3, s4, v15 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 @@ -35214,15 +35240,15 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_add_f32_e32 v4, s4, v15 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v10 -; VI-NEXT: v_cndmask_b32_e32 v11, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v15 +; VI-NEXT: v_cndmask_b32_e32 v10, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 @@ -35230,8 +35256,8 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; VI-NEXT: v_add_f32_e32 v4, s4, v10 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v15 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 @@ -35239,79 +35265,79 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v10 +; VI-NEXT: v_add_f32_e32 v5, s4, v15 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s5, s21, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v13, s5, v10 -; VI-NEXT: s_lshl_b32 s5, s22, 16 +; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s5, v10 +; VI-NEXT: v_add_f32_e32 v6, s4, v15 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v6, s4, v15 +; VI-NEXT: v_cndmask_b32_e32 v12, v7, v12, vcc +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v13, vcc +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; VI-NEXT: v_add_f32_e32 v6, s4, v15 ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; VI-NEXT: v_or_b32_e32 v14, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v6, v7, v14, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v10 +; VI-NEXT: v_add_f32_e32 v7, s4, v15 ; VI-NEXT: v_bfe_u32 v14, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v7 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] -; VI-NEXT: v_cndmask_b32_e32 v7, v14, v15, vcc -; VI-NEXT: v_bfe_u32 v5, v13, 16, 1 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_cndmask_b32_e32 v7, v14, v16, vcc +; VI-NEXT: v_add_f32_e32 v14, s4, v15 +; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v14 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_add_f32_e32 v15, s4, v15 +; VI-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v13 -; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v13, s4, v10 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; VI-NEXT: v_bfe_u32 v7, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v13 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v7, v14, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v10 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v15, vcc -; VI-NEXT: v_add_f32_e32 v7, s4, v10 -; VI-NEXT: v_bfe_u32 v10, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v7 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v15, v10, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b64 v[15:16], 16, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[13:14], 16, v[13:14] -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[14:15] +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13] +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] ; VI-NEXT: v_lshrrev_b64 v[7:8], 16, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] ; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] ; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, v7 ; VI-NEXT: v_mov_b32_e32 v3, v10 -; VI-NEXT: v_mov_b32_e32 v5, v13 -; VI-NEXT: v_mov_b32_e32 v7, v15 +; VI-NEXT: v_mov_b32_e32 v5, v12 +; VI-NEXT: v_mov_b32_e32 v7, v14 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -35339,155 +35365,155 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 -; GFX9-NEXT: s_lshl_b32 s4, s16, 16 ; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_add_f32_e32 v0, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v0 -; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_lshl_b32 s4, s16, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 ; GFX9-NEXT: s_and_b32 s4, s17, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v2, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_add_f32_e32 v3, s4, v1 ; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 ; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: s_lshl_b32 s4, s17, 16 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: s_lshl_b32 s4, s19, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 -; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 +; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX9-NEXT: v_add_f32_e32 v4, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v5, v6, vcc ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v4 ; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: s_lshl_b32 s4, s18, 16 +; GFX9-NEXT: s_lshl_b32 s4, s20, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 ; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 -; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v6, v7, vcc -; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 -; GFX9-NEXT: s_lshl_b32 s4, s19, 16 +; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 ; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v13, v6, v7, vcc ; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 -; GFX9-NEXT: s_and_b32 s5, s22, 0xffff0000 ; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_add_f32_e32 v5, s5, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v6, v7, vcc -; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v5 -; GFX9-NEXT: v_add_u32_e32 v6, 0x7fff, v6 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: s_lshl_b32 s5, s22, 16 +; GFX9-NEXT: s_lshl_b32 s4, s21, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX9-NEXT: v_add_f32_e32 v6, s5, v1 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v1 ; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 ; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 -; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v6 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: s_and_b32 s5, s23, 0xffff0000 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; GFX9-NEXT: v_add_f32_e32 v7, s5, v1 -; GFX9-NEXT: v_bfe_u32 v12, v7, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v12, v12, v7 -; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 -; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX9-NEXT: v_add_f32_e32 v6, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v7, v14, vcc +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: s_lshl_b32 s4, s22, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v15, vcc +; GFX9-NEXT: v_add_f32_e32 v7, s4, v1 +; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v15, v15, v7 +; GFX9-NEXT: s_and_b32 s4, s23, 0xffff0000 +; GFX9-NEXT: v_add_u32_e32 v15, 0x7fff, v15 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v16, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v16, v16, v7 +; GFX9-NEXT: s_lshl_b32 s4, s23, 16 +; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: s_lshl_b32 s5, s23, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc -; GFX9-NEXT: v_add_f32_e32 v12, s5, v1 -; GFX9-NEXT: v_bfe_u32 v13, v12, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v13, v13, v12 -; GFX9-NEXT: v_add_u32_e32 v13, 0x7fff, v13 -; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; GFX9-NEXT: v_mov_b32_e32 v13, 0xffff -; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_and_b32_sdwa v6, v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_and_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v6, v5, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v5, s4, v1 -; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v12 -; GFX9-NEXT: v_bfe_u32 v12, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v12, v12, v5 -; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 -; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: s_lshl_b32 s4, s21, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v14, vcc -; GFX9-NEXT: v_add_f32_e32 v12, s4, v1 -; GFX9-NEXT: v_bfe_u32 v14, v12, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v12 -; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc -; GFX9-NEXT: s_and_b32 s4, s20, 0xffff0000 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_and_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v12, s4, v1 -; GFX9-NEXT: v_bfe_u32 v14, v12, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v12 -; GFX9-NEXT: s_lshl_b32 s4, s20, 16 -; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; GFX9-NEXT: v_add_f32_e32 v1, s4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc -; GFX9-NEXT: v_bfe_u32 v14, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v1 -; GFX9-NEXT: v_add_u32_e32 v14, 0x7fff, v14 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc +; GFX9-NEXT: v_bfe_u32 v16, v1, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v16, v16, v1 +; GFX9-NEXT: v_add_u32_e32 v16, 0x7fff, v16 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v15, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; GFX9-NEXT: v_and_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v10 -; GFX9-NEXT: v_and_b32_sdwa v10, v13, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v0, v16, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; GFX9-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v16, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v9, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v8 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB103_3: ; GFX9-NEXT: s_branch .LBB103_2 @@ -36084,231 +36110,235 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { ; VI-LABEL: bitcast_v16f16_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 ; VI-NEXT: v_mov_b32_e32 v35, v5 ; VI-NEXT: v_mov_b32_e32 v34, v4 -; VI-NEXT: v_mov_b32_e32 v33, v3 -; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v37, v3 +; VI-NEXT: v_mov_b32_e32 v36, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB104_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 ; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: .LBB104_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB104_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v5, 0x200 -; VI-NEXT: v_add_f16_sdwa v14, v33, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v36, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; VI-NEXT: v_add_f16_e32 v33, 0x200, v33 -; VI-NEXT: v_add_f16_sdwa v10, v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_or_b32_e32 v12, v33, v8 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 -; VI-NEXT: v_add_f16_sdwa v22, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v1, v2 -; VI-NEXT: v_add_f16_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v32, v8 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; VI-NEXT: v_mov_b32_e32 v3, 0x200 +; VI-NEXT: v_add_f16_sdwa v14, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_add_f16_e32 v37, 0x200, v37 +; VI-NEXT: v_add_f16_sdwa v10, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v37, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; VI-NEXT: v_add_f16_e32 v36, 0x200, v36 +; VI-NEXT: v_add_f16_sdwa v22, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v6, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v36, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 ; VI-NEXT: v_add_f16_e32 v35, 0x200, v35 -; VI-NEXT: v_add_f16_sdwa v18, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v30, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v26, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 -; VI-NEXT: v_or_b32_e32 v9, v35, v8 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v18 +; VI-NEXT: v_add_f16_sdwa v18, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_f16_e32 v39, 0x200, v39 +; VI-NEXT: v_add_f16_sdwa v2, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v35, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 ; VI-NEXT: v_add_f16_e32 v34, 0x200, v34 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 -; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_or_b32_e32 v3, v0, v3 -; VI-NEXT: v_or_b32_e32 v8, v34, v8 -; VI-NEXT: v_or_b32_e32 v16, v7, v13 -; VI-NEXT: v_or_b32_e32 v15, v6, v5 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[15:16] -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[8:9] -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8 +; VI-NEXT: v_add_f16_sdwa v30, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v26, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v39, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_add_f16_e32 v38, 0x200, v38 +; VI-NEXT: v_or_b32_e32 v15, v34, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v33 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v0, v38, v0 +; VI-NEXT: v_or_b32_e32 v24, v33, v4 +; VI-NEXT: v_or_b32_e32 v23, v32, v3 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[23:24] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v24 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v23 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; VI-NEXT: v_bfe_u32 v31, v30, 8, 8 ; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 -; VI-NEXT: v_bfe_u32 v37, v36, 8, 8 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; VI-NEXT: .LBB104_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v8, v32 -; VI-NEXT: v_mov_b32_e32 v12, v33 +; VI-NEXT: v_mov_b32_e32 v0, v38 +; VI-NEXT: v_mov_b32_e32 v4, v39 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v12, v37 ; VI-NEXT: v_mov_b32_e32 v16, v34 ; VI-NEXT: v_mov_b32_e32 v20, v35 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v36 -; VI-NEXT: v_mov_b32_e32 v7, v37 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16f16_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 -; GFX9-NEXT: v_mov_b32_e32 v35, v3 -; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v35, v5 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_mov_b32_e32 v37, v3 +; GFX9-NEXT: v_mov_b32_e32 v36, v2 +; GFX9-NEXT: v_mov_b32_e32 v39, v1 +; GFX9-NEXT: v_mov_b32_e32 v38, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB104_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB104_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB104_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 -; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v39, v39, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v38, v38, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v37, v37, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v36, v36, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v35, v35, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v34, v34, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v33, v33, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB104_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v34 -; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v12, v37 +; GFX9-NEXT: v_mov_b32_e32 v16, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v35 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v16f16_to_v32i8: @@ -36535,54 +36565,54 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-LABEL: bitcast_v16f16_to_v32i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v32, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v40, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v55, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v42, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v41, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v24 ; SI-NEXT: v_or_b32_e32 v48, v16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; SI-NEXT: v_or_b32_e32 v49, v8, v1 ; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: v_or_b32_e32 v35, v52, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_or_b32_e32 v35, v39, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_or_b32_e32 v36, v39, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v37, v55, v2 +; SI-NEXT: v_or_b32_e32 v36, v32, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v37, v52, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v38, v54, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_or_b32_e32 v38, v51, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v33, v54, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v34, v0, v2 ; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 ; SI-NEXT: v_lshr_b64 v[11:12], v[35:36], 24 -; SI-NEXT: v_or_b32_e32 v33, v41, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshr_b64 v[19:20], v[37:38], 24 +; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 ; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 ; SI-NEXT: v_lshr_b64 v[12:13], v[35:36], 16 -; SI-NEXT: v_or_b32_e32 v34, v0, v2 -; SI-NEXT: v_lshr_b64 v[24:25], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[33:34], 16 ; SI-NEXT: v_lshr_b64 v[9:10], v[35:36], 8 +; SI-NEXT: v_lshr_b64 v[17:18], v[37:38], 8 +; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v36 ; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v38 @@ -36591,15 +36621,10 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 -; SI-NEXT: v_lshr_b64 v[19:20], v[37:38], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 v[50:51], v[33:34], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 ; SI-NEXT: s_cbranch_execnz .LBB105_3 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -36610,8 +36635,8 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v33, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -36623,28 +36648,28 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v37, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 ; SI-NEXT: v_or_b32_e32 v38, v0, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_or_b32_e32 v35, v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 ; SI-NEXT: v_or_b32_e32 v36, v1, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 @@ -36664,15 +36689,15 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_or_b32_e32 v49, v2, v0 ; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 ; SI-NEXT: v_lshr_b64 v[11:12], v[35:36], 24 -; SI-NEXT: v_lshr_b64 v[24:25], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[37:38], 24 +; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 ; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 ; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 ; SI-NEXT: v_lshr_b64 v[12:13], v[35:36], 16 ; SI-NEXT: v_lshr_b64 v[9:10], v[35:36], 8 -; SI-NEXT: v_lshr_b64 v[19:20], v[37:38], 24 +; SI-NEXT: v_lshr_b64 v[20:21], v[37:38], 16 ; SI-NEXT: v_lshr_b64 v[17:18], v[37:38], 8 -; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 -; SI-NEXT: v_lshr_b64 v[50:51], v[33:34], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[33:34], 16 ; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v36 @@ -36683,9 +36708,6 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 ; SI-NEXT: .LBB105_3: ; %end -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v0, v48 ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v4, v49 @@ -36693,12 +36715,11 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v10, v12 ; SI-NEXT: v_mov_b32_e32 v12, v36 ; SI-NEXT: v_mov_b32_e32 v16, v37 -; SI-NEXT: v_mov_b32_e32 v18, v24 +; SI-NEXT: v_mov_b32_e32 v18, v20 ; SI-NEXT: v_mov_b32_e32 v20, v38 ; SI-NEXT: v_mov_b32_e32 v24, v33 -; SI-NEXT: v_mov_b32_e32 v26, v50 +; SI-NEXT: v_mov_b32_e32 v26, v28 ; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB105_4: ; SI-NEXT: ; implicit-def: $vgpr48 @@ -36709,22 +36730,22 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v16f16_to_v32i8_scalar: @@ -36763,55 +36784,55 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v1, 0x200 ; VI-NEXT: v_add_f16_e32 v6, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_f16_e32 v35, s17, v1 ; VI-NEXT: v_add_f16_e32 v2, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v37, v35, v0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, s16, v1 ; VI-NEXT: v_add_f16_e32 v14, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 +; VI-NEXT: v_or_b32_e32 v36, v0, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 ; VI-NEXT: v_add_f16_e32 v34, s19, v1 ; VI-NEXT: v_add_f16_e32 v10, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s21, 16 -; VI-NEXT: v_or_b32_e32 v12, v34, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; VI-NEXT: v_or_b32_e32 v39, v34, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 ; VI-NEXT: v_add_f16_e32 v8, s18, v1 ; VI-NEXT: v_add_f16_e32 v22, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s20, 16 -; VI-NEXT: v_or_b32_e32 v11, v8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; VI-NEXT: v_or_b32_e32 v38, v8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 ; VI-NEXT: v_add_f16_e32 v33, s21, v1 ; VI-NEXT: v_add_f16_e32 v18, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s23, 16 -; VI-NEXT: v_or_b32_e32 v20, v33, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; VI-NEXT: v_or_b32_e32 v49, v33, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 ; VI-NEXT: v_add_f16_e32 v16, s20, v1 ; VI-NEXT: v_add_f16_e32 v30, s4, v1 ; VI-NEXT: s_lshr_b32 s4, s22, 16 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; VI-NEXT: v_add_f16_e32 v35, s17, v1 -; VI-NEXT: v_or_b32_e32 v19, v16, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 +; VI-NEXT: v_or_b32_e32 v48, v16, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 ; VI-NEXT: v_add_f16_e32 v32, s23, v1 ; VI-NEXT: v_add_f16_e32 v26, s4, v1 -; VI-NEXT: v_or_b32_e32 v4, v35, v0 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; VI-NEXT: v_add_f16_e32 v0, s16, v1 -; VI-NEXT: v_or_b32_e32 v37, v32, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; VI-NEXT: v_or_b32_e32 v51, v32, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; VI-NEXT: v_add_f16_e32 v24, s22, v1 -; VI-NEXT: v_or_b32_e32 v3, v0, v3 -; VI-NEXT: v_or_b32_e32 v36, v24, v5 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[36:37] -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[19:20] -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v37 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v36 +; VI-NEXT: v_or_b32_e32 v50, v24, v3 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[50:51] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[48:49] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[38:39] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[36:37] +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v51 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v50 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v48 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v38 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 ; VI-NEXT: v_bfe_u32 v31, v30, 8, 8 ; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 @@ -36915,39 +36936,39 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB105_4 ; GFX9-NEXT: .LBB105_2: ; %cmp.true -; GFX9-NEXT: v_mov_b32_e32 v2, 0x200 -; GFX9-NEXT: v_pk_add_f16 v1, s17, v2 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, s16, v2 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, s19, v2 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, s18, v2 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v17, s21, v2 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, s20, v2 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v25, s23, v2 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v24, s22, v2 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x200 +; GFX9-NEXT: v_pk_add_f16 v39, s17, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v38, s16, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v37, s19, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v36, s18, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v35, s21, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v34, s20, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v33, s23, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, s22, v0 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: s_branch .LBB105_5 ; GFX9-NEXT: .LBB105_3: ; GFX9-NEXT: ; implicit-def: $sgpr59 @@ -36976,30 +36997,30 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr14 ; GFX9-NEXT: s_branch .LBB105_2 ; GFX9-NEXT: .LBB105_4: -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v8, s18 -; GFX9-NEXT: v_mov_b32_e32 v9, s19 -; GFX9-NEXT: v_mov_b32_e32 v16, s20 -; GFX9-NEXT: v_mov_b32_e32 v17, s21 -; GFX9-NEXT: v_mov_b32_e32 v24, s22 -; GFX9-NEXT: v_mov_b32_e32 v25, s23 -; GFX9-NEXT: v_mov_b32_e32 v35, s59 +; GFX9-NEXT: v_mov_b32_e32 v38, s16 +; GFX9-NEXT: v_mov_b32_e32 v39, s17 +; GFX9-NEXT: v_mov_b32_e32 v36, s18 +; GFX9-NEXT: v_mov_b32_e32 v37, s19 +; GFX9-NEXT: v_mov_b32_e32 v34, s20 +; GFX9-NEXT: v_mov_b32_e32 v35, s21 +; GFX9-NEXT: v_mov_b32_e32 v32, s22 +; GFX9-NEXT: v_mov_b32_e32 v33, s23 +; GFX9-NEXT: v_mov_b32_e32 v1, s59 ; GFX9-NEXT: v_mov_b32_e32 v2, s57 ; GFX9-NEXT: v_mov_b32_e32 v5, s58 ; GFX9-NEXT: v_mov_b32_e32 v6, s56 ; GFX9-NEXT: v_mov_b32_e32 v7, s47 -; GFX9-NEXT: v_mov_b32_e32 v34, s46 +; GFX9-NEXT: v_mov_b32_e32 v9, s46 ; GFX9-NEXT: v_mov_b32_e32 v10, s44 ; GFX9-NEXT: v_mov_b32_e32 v13, s45 ; GFX9-NEXT: v_mov_b32_e32 v14, s43 ; GFX9-NEXT: v_mov_b32_e32 v15, s42 -; GFX9-NEXT: v_mov_b32_e32 v33, s41 +; GFX9-NEXT: v_mov_b32_e32 v17, s41 ; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: v_mov_b32_e32 v21, s40 ; GFX9-NEXT: v_mov_b32_e32 v22, s28 ; GFX9-NEXT: v_mov_b32_e32 v23, s27 -; GFX9-NEXT: v_mov_b32_e32 v32, s26 +; GFX9-NEXT: v_mov_b32_e32 v25, s26 ; GFX9-NEXT: v_mov_b32_e32 v26, s24 ; GFX9-NEXT: v_mov_b32_e32 v29, s25 ; GFX9-NEXT: v_mov_b32_e32 v30, s15 @@ -37009,14 +37030,14 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v11, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: .LBB105_5: ; %end -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v12, v9 -; GFX9-NEXT: v_mov_b32_e32 v20, v17 -; GFX9-NEXT: v_mov_b32_e32 v28, v25 -; GFX9-NEXT: v_mov_b32_e32 v1, v35 -; GFX9-NEXT: v_mov_b32_e32 v9, v34 -; GFX9-NEXT: v_mov_b32_e32 v17, v33 -; GFX9-NEXT: v_mov_b32_e32 v25, v32 +; GFX9-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v12, v37 +; GFX9-NEXT: v_mov_b32_e32 v16, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v35 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: bitcast_v16f16_to_v32i8_scalar: @@ -38313,24 +38334,6 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -38348,19 +38351,37 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_or_b32 s6, s5, s6 +; VI-NEXT: s_and_b32 s5, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s7, s5, s7 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s5, v0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: s_cbranch_execnz .LBB107_3 ; VI-NEXT: .LBB107_2: ; %cmp.true ; VI-NEXT: s_add_i32 s28, s28, 3 @@ -38469,53 +38490,53 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 ; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s5, s6 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s5, s7 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_or_b32_sdwa v4, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_cbranch_execnz .LBB107_3 ; GFX9-NEXT: .LBB107_2: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v3, 3, v12 @@ -39157,480 +39178,484 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; VI-LABEL: bitcast_v16bf16_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_mov_b32_e32 v33, v7 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v35, v5 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_mov_b32_e32 v37, v3 +; VI-NEXT: v_mov_b32_e32 v36, v2 +; VI-NEXT: v_mov_b32_e32 v39, v1 +; VI-NEXT: v_mov_b32_e32 v38, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 ; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr14 ; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr3 -; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB108_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: .LBB108_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: s_movk_i32 s6, 0x7fff -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_alignbit_b32 v39, v1, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v35, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v34, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v33, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v32, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v38, v1, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v36, v1, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v35, v1, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v33, v1, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v8, v34 -; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 -; VI-NEXT: v_mov_b32_e32 v24, v6 -; VI-NEXT: v_mov_b32_e32 v28, v7 -; VI-NEXT: v_mov_b32_e32 v1, v38 -; VI-NEXT: v_mov_b32_e32 v6, v37 -; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: v_mov_b32_e32 v0, v38 +; VI-NEXT: v_mov_b32_e32 v4, v39 +; VI-NEXT: v_mov_b32_e32 v8, v36 +; VI-NEXT: v_mov_b32_e32 v12, v37 +; VI-NEXT: v_mov_b32_e32 v16, v34 +; VI-NEXT: v_mov_b32_e32 v20, v35 +; VI-NEXT: v_mov_b32_e32 v24, v32 +; VI-NEXT: v_mov_b32_e32 v28, v33 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16bf16_to_v32i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v33, v5 -; GFX9-NEXT: v_mov_b32_e32 v32, v4 -; GFX9-NEXT: v_mov_b32_e32 v35, v3 -; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-NEXT: v_mov_b32_e32 v32, v6 +; GFX9-NEXT: v_mov_b32_e32 v35, v5 +; GFX9-NEXT: v_mov_b32_e32 v34, v4 +; GFX9-NEXT: v_mov_b32_e32 v37, v3 +; GFX9-NEXT: v_mov_b32_e32 v36, v2 +; GFX9-NEXT: v_mov_b32_e32 v39, v1 +; GFX9-NEXT: v_mov_b32_e32 v38, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: ; implicit-def: $vgpr37 -; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr13 ; GFX9-NEXT: ; implicit-def: $vgpr14 ; GFX9-NEXT: ; implicit-def: $vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr31 -; GFX9-NEXT: ; implicit-def: $vgpr3 -; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB108_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v38 ; GFX9-NEXT: .LBB108_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB108_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v39 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc +; GFX9-NEXT: s_movk_i32 s6, 0x7fff ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: v_perm_b32 v3, v0, v5, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 ; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 -; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v34 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v10, v11, vcc -; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; GFX9-NEXT: v_perm_b32 v11, v9, v13, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v33 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v33 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v10, v14, vcc -; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v32 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v10, v15, vcc -; GFX9-NEXT: v_bfe_u32 v15, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v32 -; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v15, v16, vcc -; GFX9-NEXT: v_bfe_u32 v15, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v15, v16, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v16, v19, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v16, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v15, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v16, v20, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v6, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v16, vcc -; GFX9-NEXT: v_perm_b32 v4, v8, v2, s7 -; GFX9-NEXT: v_perm_b32 v12, v1, v0, s7 -; GFX9-NEXT: v_perm_b32 v10, v17, v14, s7 -; GFX9-NEXT: v_perm_b32 v9, v9, v18, s7 -; GFX9-NEXT: v_perm_b32 v16, v7, v19, s7 -; GFX9-NEXT: v_perm_b32 v15, v6, v20, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v15 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[15:16] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v9 -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v3 -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v3, v8, vcc +; GFX9-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v8, v8, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v8, v9, vcc +; GFX9-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v8, v8, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v35 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v10, v10, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v34 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v10, v11, vcc +; GFX9-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v10, v10, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v33 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_bfe_u32 v11, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v11, v11, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v33 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v11, v17, vcc +; GFX9-NEXT: v_bfe_u32 v11, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v11, v11, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_bfe_u32 v19, v10, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v11, v18, vcc +; GFX9-NEXT: v_add3_u32 v19, v19, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc +; GFX9-NEXT: v_bfe_u32 v20, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v20, v20, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v20, v21, vcc +; GFX9-NEXT: v_perm_b32 v1, v5, v4, s7 +; GFX9-NEXT: v_perm_b32 v0, v0, v7, s7 +; GFX9-NEXT: v_perm_b32 v3, v12, v6, s7 +; GFX9-NEXT: v_perm_b32 v2, v2, v13, s7 +; GFX9-NEXT: v_perm_b32 v9, v15, v14, s7 +; GFX9-NEXT: v_perm_b32 v8, v8, v16, s7 +; GFX9-NEXT: v_perm_b32 v11, v18, v17, s7 +; GFX9-NEXT: v_perm_b32 v10, v10, v19, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[10:11] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v11 +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: .LBB108_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v34 -; GFX9-NEXT: v_mov_b32_e32 v12, v35 -; GFX9-NEXT: v_mov_b32_e32 v16, v32 -; GFX9-NEXT: v_mov_b32_e32 v20, v33 -; GFX9-NEXT: v_mov_b32_e32 v24, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v38 -; GFX9-NEXT: v_mov_b32_e32 v6, v37 -; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: v_mov_b32_e32 v0, v38 +; GFX9-NEXT: v_mov_b32_e32 v4, v39 +; GFX9-NEXT: v_mov_b32_e32 v8, v36 +; GFX9-NEXT: v_mov_b32_e32 v12, v37 +; GFX9-NEXT: v_mov_b32_e32 v16, v34 +; GFX9-NEXT: v_mov_b32_e32 v20, v35 +; GFX9-NEXT: v_mov_b32_e32 v24, v32 +; GFX9-NEXT: v_mov_b32_e32 v28, v33 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v32i8: @@ -40135,205 +40160,202 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e64 v46, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v45, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e64 v56, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v16, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v47, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v13, 1.0, s22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s27 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v21, 1.0, s26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v32, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v0 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v46 -; SI-NEXT: v_lshr_b64 v[53:54], v[16:17], 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v47 -; SI-NEXT: v_lshr_b64 v[39:40], v[24:25], 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v60 -; SI-NEXT: v_lshr_b64 v[50:51], v[8:9], 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v45 -; SI-NEXT: v_lshr_b64 v[54:55], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[42:43], v[32:33], 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 -; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[43:44], v[29:30], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[53:54], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[53:54], 8 -; SI-NEXT: v_lshr_b64 v[19:20], v[39:40], 24 -; SI-NEXT: v_lshr_b64 v[34:35], v[39:40], 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 24 -; SI-NEXT: v_lshr_b64 v[48:49], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[50:51], 8 -; SI-NEXT: v_lshr_b64 v[11:12], v[53:54], 24 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v45 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v47 -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v51 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v57 -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v54 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v59 -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v40 -; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v43 -; SI-NEXT: v_lshr_b64 v[17:18], v[39:40], 8 -; SI-NEXT: v_lshr_b64 v[27:28], v[42:43], 24 -; SI-NEXT: v_lshr_b64 v[37:38], v[42:43], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[42:43], 8 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v47 +; SI-NEXT: v_lshr_b64 v[51:52], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v55 +; SI-NEXT: v_lshr_b64 v[52:53], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[24:25], 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v56 +; SI-NEXT: v_lshr_b64 v[43:44], v[32:33], 16 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v58 +; SI-NEXT: v_lshr_b64 v[48:49], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v54 +; SI-NEXT: v_lshr_b64 v[41:42], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[51:52], 8 +; SI-NEXT: v_lshr_b64 v[35:36], v[40:41], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[40:41], 8 +; SI-NEXT: v_lshr_b64 v[25:26], v[43:44], 8 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[38:39], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[51:52], 24 +; SI-NEXT: v_lshr_b64 v[19:20], v[40:41], 24 +; SI-NEXT: v_lshr_b64 v[27:28], v[43:44], 24 +; SI-NEXT: v_lshr_b64 v[36:37], v[43:44], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v54 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v46 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v49 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v56 +; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v52 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v58 +; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v41 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v44 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[42:43], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[0:1], 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[39:40], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[0:1], 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[53:54], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[0:1], 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v47 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[50:51], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[0:1], 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshr_b64 v[52:53], v[13:14], 16 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshr_b64 v[41:42], v[21:22], 16 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; SI-NEXT: v_lshr_b64 v[54:55], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[43:44], v[29:30], 16 -; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[53:54], 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 24 -; SI-NEXT: v_lshr_b64 v[48:49], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[50:51], 8 -; SI-NEXT: v_lshr_b64 v[11:12], v[53:54], 24 -; SI-NEXT: v_lshr_b64 v[9:10], v[53:54], 8 -; SI-NEXT: v_lshr_b64 v[19:20], v[39:40], 24 -; SI-NEXT: v_lshr_b64 v[34:35], v[39:40], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[39:40], 8 -; SI-NEXT: v_lshr_b64 v[27:28], v[42:43], 24 -; SI-NEXT: v_lshr_b64 v[37:38], v[42:43], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[42:43], 8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v51 -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v54 -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v40 -; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v43 +; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[51:52], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[40:41], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[38:39], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[51:52], 24 +; SI-NEXT: v_lshr_b64 v[9:10], v[51:52], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[40:41], 24 +; SI-NEXT: v_lshr_b64 v[17:18], v[40:41], 8 +; SI-NEXT: v_lshr_b64 v[27:28], v[43:44], 24 +; SI-NEXT: v_lshr_b64 v[36:37], v[43:44], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[43:44], 8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v49 +; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v52 +; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v41 +; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v44 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 ; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 ; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: v_mov_b32_e32 v13, v20 -; SI-NEXT: v_mov_b32_e32 v20, v40 -; SI-NEXT: v_mov_b32_e32 v24, v42 -; SI-NEXT: v_mov_b32_e32 v28, v43 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v0, v50 -; SI-NEXT: v_mov_b32_e32 v2, v48 -; SI-NEXT: v_mov_b32_e32 v4, v51 +; SI-NEXT: v_mov_b32_e32 v16, v40 +; SI-NEXT: v_mov_b32_e32 v20, v41 +; SI-NEXT: v_mov_b32_e32 v24, v43 +; SI-NEXT: v_mov_b32_e32 v28, v44 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v48 +; SI-NEXT: v_mov_b32_e32 v2, v38 +; SI-NEXT: v_mov_b32_e32 v4, v49 ; SI-NEXT: v_mov_b32_e32 v5, v10 -; SI-NEXT: v_mov_b32_e32 v8, v53 -; SI-NEXT: v_mov_b32_e32 v10, v36 -; SI-NEXT: v_mov_b32_e32 v12, v54 -; SI-NEXT: v_mov_b32_e32 v16, v39 -; SI-NEXT: v_mov_b32_e32 v18, v34 -; SI-NEXT: v_mov_b32_e32 v21, v33 -; SI-NEXT: v_mov_b32_e32 v26, v37 -; SI-NEXT: v_mov_b32_e32 v29, v35 +; SI-NEXT: v_mov_b32_e32 v8, v51 +; SI-NEXT: v_mov_b32_e32 v10, v34 +; SI-NEXT: v_mov_b32_e32 v12, v52 +; SI-NEXT: v_mov_b32_e32 v13, v18 +; SI-NEXT: v_mov_b32_e32 v18, v35 +; SI-NEXT: v_mov_b32_e32 v21, v26 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v29, v33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v16bf16_to_v32i8_scalar: @@ -40510,17 +40532,17 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; VI-NEXT: v_mov_b32_e32 v25, v28 -; VI-NEXT: v_mov_b32_e32 v1, v4 -; VI-NEXT: v_mov_b32_e32 v9, v12 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_mov_b32_e32 v17, v20 -; VI-NEXT: v_lshrrev_b64 v[36:37], 24, v[24:25] -; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[24:25] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] +; VI-NEXT: v_mov_b32_e32 v9, v12 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[16:17] +; VI-NEXT: v_mov_b32_e32 v1, v4 ; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[8:9] -; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[0:1] ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 @@ -40569,6 +40591,14 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: +; VI-NEXT: v_mov_b32_e32 v24, s22 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v12, s19 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v4, s17 ; VI-NEXT: v_mov_b32_e32 v26, s59 ; VI-NEXT: v_mov_b32_e32 v25, s58 ; VI-NEXT: v_mov_b32_e32 v31, s57 @@ -40589,23 +40619,15 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_mov_b32_e32 v7, s24 ; VI-NEXT: v_mov_b32_e32 v6, s15 ; VI-NEXT: v_mov_b32_e32 v5, s14 -; VI-NEXT: v_mov_b32_e32 v24, s22 -; VI-NEXT: v_mov_b32_e32 v28, s23 -; VI-NEXT: v_mov_b32_e32 v16, s20 -; VI-NEXT: v_mov_b32_e32 v20, s21 -; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v12, s19 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v4, s17 -; VI-NEXT: v_mov_b32_e32 v36, s10 -; VI-NEXT: v_mov_b32_e32 v37, s8 +; VI-NEXT: v_mov_b32_e32 v32, s10 +; VI-NEXT: v_mov_b32_e32 v33, s8 ; VI-NEXT: v_mov_b32_e32 v34, s6 -; VI-NEXT: v_mov_b32_e32 v32, s4 +; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_mov_b32_e32 v3, v32 +; VI-NEXT: v_mov_b32_e32 v3, v35 ; VI-NEXT: v_mov_b32_e32 v11, v34 -; VI-NEXT: v_mov_b32_e32 v19, v37 -; VI-NEXT: v_mov_b32_e32 v27, v36 +; VI-NEXT: v_mov_b32_e32 v19, v33 +; VI-NEXT: v_mov_b32_e32 v27, v32 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16bf16_to_v32i8_scalar: @@ -40661,55 +40683,55 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: s_and_b32 s4, s16, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v4, v6, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x7fff, v1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_lshl_b32 s4, s16, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_add_f32_e32 v0, s4, v5 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX9-NEXT: s_and_b32 s4, s19, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v5 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v5 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc ; GFX9-NEXT: s_lshl_b32 s4, s19, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v5 -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 0x7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v5 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v34 ; GFX9-NEXT: s_and_b32 s4, s18, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v2, v14, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v1, s4, v5 -; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v1 +; GFX9-NEXT: v_lshl_or_b32 v4, v14, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v3, s4, v5 +; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v3 ; GFX9-NEXT: v_add_u32_e32 v7, 0x7fff, v7 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: s_lshl_b32 s4, s18, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 ; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v8, v8, v7 @@ -40718,10 +40740,10 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v8 ; GFX9-NEXT: s_and_b32 s4, s21, 0xffff0000 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v7 ; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 ; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 ; GFX9-NEXT: v_add_u32_e32 v9, v9, v7 @@ -40777,50 +40799,50 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 ; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc -; GFX9-NEXT: v_add_f32_e32 v11, s4, v5 -; GFX9-NEXT: v_bfe_u32 v12, v11, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v12, v12, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX9-NEXT: s_and_b32 s4, s22, 0xffff0000 +; GFX9-NEXT: v_lshl_or_b32 v12, v30, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v7, s4, v5 +; GFX9-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v7 ; GFX9-NEXT: s_lshl_b32 s4, s22, 16 -; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 -; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_add_f32_e32 v5, s4, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; GFX9-NEXT: v_bfe_u32 v12, v5, 16, 1 -; GFX9-NEXT: v_add_u32_e32 v12, v12, v5 -; GFX9-NEXT: v_add_u32_e32 v12, 0x7fff, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc +; GFX9-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX9-NEXT: v_add_u32_e32 v11, v11, v5 +; GFX9-NEXT: v_add_u32_e32 v11, 0x7fff, v11 ; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v13, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v11, v11, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v12, v30, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v11, v7, 16, v5 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[11:12] ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v11 -; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v9 -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: s_branch .LBB109_5 ; GFX9-NEXT: .LBB109_3: ; GFX9-NEXT: ; implicit-def: $sgpr25 @@ -42658,24 +42680,6 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_cbranch_scc0 .LBB111_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -42693,19 +42697,37 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_or_b32 s6, s5, s6 +; VI-NEXT: s_and_b32 s5, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_or_b32 s7, s5, s7 +; VI-NEXT: s_and_b32 s5, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s5, s5, s8 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s5, v0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v21, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: s_cbranch_execnz .LBB111_3 ; VI-NEXT: .LBB111_2: ; %cmp.true ; VI-NEXT: s_add_i32 s28, s28, 3 @@ -42814,53 +42836,53 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 ; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 +; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s5, s6 +; GFX9-NEXT: s_and_b32 s5, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s7 +; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s5, s7 +; GFX9-NEXT: s_and_b32 s5, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_or_b32_sdwa v4, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_cbranch_execnz .LBB111_3 ; GFX9-NEXT: .LBB111_2: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v3, 3, v12 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll index 4e60831ca3da5..215ce32c0a2d7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll @@ -1755,36 +1755,36 @@ define <9 x i32> @bitcast_v18f16_to_v9i32(<18 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 @@ -1979,36 +1979,36 @@ define inreg <9 x i32> @bitcast_v18f16_to_v9i32_scalar(<18 x half> inreg %a, i32 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 @@ -3515,36 +3515,36 @@ define <9 x float> @bitcast_v18f16_to_v9f32(<18 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 @@ -3739,36 +3739,36 @@ define inreg <9 x float> @bitcast_v18f16_to_v9f32_scalar(<18 x half> inreg %a, i ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v21 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v17 @@ -4663,28 +4663,29 @@ define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i ; SI-LABEL: bitcast_v18f16_to_v18i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: v_mov_b32_e32 v9, v2 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v5, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v9 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -4695,82 +4696,82 @@ define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_or_b32_e32 v14, v14, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v6, v6, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v6, v6, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v2, v2, v18 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v2, v18 -; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v17 +; SI-NEXT: v_lshr_b64 v[22:23], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[13:14], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v16, v16, v24 ; SI-NEXT: .LBB23_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v23 -; SI-NEXT: v_mov_b32_e32 v5, v18 -; SI-NEXT: v_mov_b32_e32 v9, v21 -; SI-NEXT: v_mov_b32_e32 v13, v19 +; SI-NEXT: v_mov_b32_e32 v1, v20 +; SI-NEXT: v_mov_b32_e32 v5, v21 +; SI-NEXT: v_mov_b32_e32 v9, v22 +; SI-NEXT: v_mov_b32_e32 v13, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: s_branch .LBB23_2 @@ -4783,51 +4784,51 @@ define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s5, s23, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x200 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s5, s24, 16 -; VI-NEXT: v_add_f16_e32 v1, s23, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s22, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s21, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v3, s24, v0 -; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s20, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s19, 16 -; VI-NEXT: v_or_b32_e32 v8, v3, v4 -; VI-NEXT: v_or_b32_e32 v4, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s19, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_add_f16_e32 v1, s18, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v1, v2 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_add_f16_sdwa v10, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v18, s4 ; VI-NEXT: v_add_f16_e32 v9, s16, v0 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s17, v0 -; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_add_f16_sdwa v10, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v11, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v12, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v13, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v14, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_add_f16_sdwa v15, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s22, v0 +; VI-NEXT: v_add_f16_sdwa v16, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s23, v0 +; VI-NEXT: v_add_f16_sdwa v17, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s24, v0 +; VI-NEXT: v_add_f16_sdwa v0, v18, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v0 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_or_b32_e32 v6, v6, v16 +; VI-NEXT: v_or_b32_e32 v5, v5, v15 +; VI-NEXT: v_or_b32_e32 v4, v4, v14 +; VI-NEXT: v_or_b32_e32 v3, v3, v13 +; VI-NEXT: v_or_b32_e32 v2, v2, v12 +; VI-NEXT: v_or_b32_e32 v1, v1, v11 ; VI-NEXT: v_or_b32_e32 v0, v9, v10 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 6fae7fdbbf9bb..7361108b2e52a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -2621,10 +2621,10 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB12_2 @@ -2705,10 +2705,10 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; VI-NEXT: .LBB12_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 @@ -2807,10 +2807,10 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB12_2 @@ -2891,10 +2891,10 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX9-NEXT: .LBB12_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 @@ -3393,6 +3393,11 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[28:29], s[20:21], 16 ; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 8 ; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 8 ; SI-NEXT: s_lshr_b32 s72, s25, 24 ; SI-NEXT: s_lshr_b32 s73, s25, 16 ; SI-NEXT: s_lshr_b32 s74, s25, 8 @@ -3408,20 +3413,15 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s92, s17, 24 ; SI-NEXT: s_lshr_b32 s93, s17, 16 ; SI-NEXT: s_lshr_b32 s94, s17, 8 -; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 8 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s5, s60, 8 -; SI-NEXT: s_and_b32 s7, s16, 0xff -; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_and_b32 s5, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s60, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_and_b32 s7, s58, 0xff -; SI-NEXT: s_lshl_b32 s9, s56, 24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_lshl_b32 s9, s56, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_and_b32 s5, s17, 0xff @@ -3430,13 +3430,13 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 ; SI-NEXT: s_and_b32 s7, s93, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s9, s92, 24 -; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_lshl_b32 s5, s46, 8 -; SI-NEXT: s_and_b32 s7, s18, 0xff -; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s7, s46, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_and_b32 s7, s44, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s9, s42, 24 @@ -3570,10 +3570,10 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v10i32_to_v40i8_scalar: @@ -5541,6 +5541,41 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v25 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 ; SI-NEXT: v_or_b32_e32 v0, v0, v38 @@ -5548,6 +5583,13 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v37, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 ; SI-NEXT: v_or_b32_e32 v0, v0, v34 @@ -5575,49 +5617,7 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v31 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s20, 0xff -; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s24, 0xff -; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -5774,30 +5774,6 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v25 ; VI-NEXT: s_cbranch_scc0 .LBB15_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -5825,6 +5801,30 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -5955,30 +5955,6 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v25 ; GFX9-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -6006,6 +5982,30 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s7, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -9502,10 +9502,10 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB32_2 @@ -9586,10 +9586,10 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; VI-NEXT: .LBB32_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 @@ -9688,10 +9688,10 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB32_2 @@ -9772,10 +9772,10 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX9-NEXT: .LBB32_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 @@ -10324,10 +10324,10 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB33_2 ; SI-NEXT: .LBB33_4: ; SI-NEXT: v_mov_b32_e32 v13, s16 @@ -10553,8 +10553,8 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v9 ; VI-NEXT: s_branch .LBB33_5 ; VI-NEXT: .LBB33_3: ; VI-NEXT: ; implicit-def: $sgpr76 @@ -10599,8 +10599,8 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v4, s23 ; VI-NEXT: v_mov_b32_e32 v1, s24 ; VI-NEXT: v_mov_b32_e32 v2, s25 -; VI-NEXT: v_mov_b32_e32 v39, s76 -; VI-NEXT: v_mov_b32_e32 v48, s74 +; VI-NEXT: v_mov_b32_e32 v48, s76 +; VI-NEXT: v_mov_b32_e32 v39, s74 ; VI-NEXT: v_mov_b32_e32 v38, s75 ; VI-NEXT: v_mov_b32_e32 v36, s73 ; VI-NEXT: v_mov_b32_e32 v37, s72 @@ -10630,10 +10630,10 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v12, s10 ; VI-NEXT: v_mov_b32_e32 v11, s12 ; VI-NEXT: .LBB33_5: ; %end +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v48 ; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v39 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v38 @@ -10778,8 +10778,8 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v9 ; GFX9-NEXT: s_branch .LBB33_5 ; GFX9-NEXT: .LBB33_3: ; GFX9-NEXT: ; implicit-def: $sgpr76 @@ -10824,8 +10824,8 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v4, s23 ; GFX9-NEXT: v_mov_b32_e32 v1, s24 ; GFX9-NEXT: v_mov_b32_e32 v2, s25 -; GFX9-NEXT: v_mov_b32_e32 v39, s76 -; GFX9-NEXT: v_mov_b32_e32 v48, s74 +; GFX9-NEXT: v_mov_b32_e32 v48, s76 +; GFX9-NEXT: v_mov_b32_e32 v39, s74 ; GFX9-NEXT: v_mov_b32_e32 v38, s75 ; GFX9-NEXT: v_mov_b32_e32 v36, s73 ; GFX9-NEXT: v_mov_b32_e32 v37, s72 @@ -10855,10 +10855,10 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v12, s10 ; GFX9-NEXT: v_mov_b32_e32 v11, s12 ; GFX9-NEXT: .LBB33_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v38 @@ -12443,6 +12443,41 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v25 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_or_b32_e32 v3, s7, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 ; SI-NEXT: v_or_b32_e32 v0, v0, v38 @@ -12450,10 +12485,17 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v37, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v0, v0, v36 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 @@ -12477,49 +12519,7 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v31 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v0 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s20, 0xff -; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s24, 0xff -; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -12676,30 +12676,6 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v25 ; VI-NEXT: s_cbranch_scc0 .LBB35_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -12727,6 +12703,30 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, s7, v0 +; VI-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -12857,30 +12857,6 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v25 ; GFX9-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s5, s29, 8 -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -12908,6 +12884,30 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_and_b32 s7, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s29, 8 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s7, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -15054,126 +15054,126 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v4 -; SI-NEXT: v_mov_b32_e32 v10, v3 -; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: v_mov_b32_e32 v13, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v25 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_or_b32_e32 v18, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_or_b32_e32 v14, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v10, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_or_b32_e32 v6, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v2, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_lshr_b64 v[24:25], v[9:10], 16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 +; SI-NEXT: v_lshr_b64 v[23:24], v[9:10], 16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v23 -; SI-NEXT: v_mov_b32_e32 v5, v21 -; SI-NEXT: v_mov_b32_e32 v9, v24 -; SI-NEXT: v_mov_b32_e32 v13, v25 -; SI-NEXT: v_mov_b32_e32 v17, v26 +; SI-NEXT: v_mov_b32_e32 v1, v21 +; SI-NEXT: v_mov_b32_e32 v5, v22 +; SI-NEXT: v_mov_b32_e32 v9, v23 +; SI-NEXT: v_mov_b32_e32 v13, v24 +; SI-NEXT: v_mov_b32_e32 v17, v25 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: s_branch .LBB47_2 @@ -15186,56 +15186,56 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s5, s24, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x200 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s5, s25, 16 -; VI-NEXT: v_add_f16_e32 v1, s24, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_lshr_b32 s5, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s23, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s22, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s21, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v3, s25, v0 -; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s20, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s19, 16 -; VI-NEXT: v_or_b32_e32 v9, v3, v4 -; VI-NEXT: v_or_b32_e32 v4, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s19, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_add_f16_e32 v1, s18, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v1, v2 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_add_f16_sdwa v11, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v9, s4 +; VI-NEXT: s_lshr_b32 s4, s25, 16 +; VI-NEXT: v_add_f16_sdwa v19, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v9, s4 ; VI-NEXT: v_add_f16_e32 v10, s16, v0 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s17, v0 -; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_add_f16_sdwa v11, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v12, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v13, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v14, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v15, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_add_f16_sdwa v16, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s22, v0 +; VI-NEXT: v_add_f16_sdwa v17, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s23, v0 +; VI-NEXT: v_add_f16_sdwa v18, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, s24, v0 +; VI-NEXT: v_add_f16_sdwa v9, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s25, v0 +; VI-NEXT: v_or_b32_e32 v9, v0, v9 +; VI-NEXT: v_or_b32_e32 v8, v8, v19 +; VI-NEXT: v_or_b32_e32 v7, v7, v18 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_or_b32_e32 v5, v5, v16 +; VI-NEXT: v_or_b32_e32 v4, v4, v15 +; VI-NEXT: v_or_b32_e32 v3, v3, v14 +; VI-NEXT: v_or_b32_e32 v2, v2, v13 +; VI-NEXT: v_or_b32_e32 v1, v1, v12 ; VI-NEXT: v_or_b32_e32 v0, v10, v11 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: @@ -15990,10 +15990,10 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB48_2 @@ -16074,10 +16074,10 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 @@ -16528,22 +16528,15 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v8, s30, 0 ; SI-NEXT: v_writelane_b32 v8, s31, 1 ; SI-NEXT: v_writelane_b32 v8, s34, 2 -; SI-NEXT: v_writelane_b32 v8, s35, 3 -; SI-NEXT: v_writelane_b32 v8, s36, 4 -; SI-NEXT: v_writelane_b32 v8, s37, 5 -; SI-NEXT: v_writelane_b32 v8, s38, 6 -; SI-NEXT: v_writelane_b32 v8, s39, 7 -; SI-NEXT: v_writelane_b32 v8, s48, 8 -; SI-NEXT: v_writelane_b32 v8, s49, 9 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_writelane_b32 v8, s50, 10 -; SI-NEXT: v_readfirstlane_b32 s39, v6 -; SI-NEXT: v_readfirstlane_b32 s48, v5 -; SI-NEXT: v_readfirstlane_b32 s49, v4 -; SI-NEXT: v_readfirstlane_b32 s50, v3 -; SI-NEXT: v_readfirstlane_b32 s35, v2 +; SI-NEXT: v_writelane_b32 v8, s35, 3 +; SI-NEXT: v_readfirstlane_b32 s30, v6 +; SI-NEXT: v_readfirstlane_b32 s31, v5 +; SI-NEXT: v_readfirstlane_b32 s34, v4 +; SI-NEXT: v_readfirstlane_b32 s35, v3 +; SI-NEXT: v_readfirstlane_b32 s94, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s38, v1 +; SI-NEXT: v_readfirstlane_b32 s95, v1 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -16567,62 +16560,62 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s29, 16 ; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_and_b32 s4, s38, 0xffff -; SI-NEXT: s_lshl_b32 s5, s35, 16 +; SI-NEXT: s_and_b32 s4, s95, 0xffff +; SI-NEXT: s_lshl_b32 s5, s94, 16 ; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 ; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s50, 0xffff -; SI-NEXT: s_lshl_b32 s5, s49, 16 +; SI-NEXT: s_and_b32 s4, s35, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s48, 0xffff -; SI-NEXT: s_lshl_b32 s15, s39, 16 +; SI-NEXT: s_and_b32 s5, s31, 0xffff +; SI-NEXT: s_lshl_b32 s15, s30, 16 +; SI-NEXT: s_or_b32 s5, s5, s15 ; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 8 ; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 24 ; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 8 ; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 -; SI-NEXT: s_or_b32 s5, s5, s15 -; SI-NEXT: s_lshr_b32 s34, s13, 8 -; SI-NEXT: s_lshr_b32 s95, s11, 8 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s79, s13, 8 +; SI-NEXT: s_lshr_b32 s73, s11, 8 ; SI-NEXT: s_lshr_b32 s59, s9, 8 ; SI-NEXT: s_lshr_b32 s45, s7, 8 ; SI-NEXT: s_lshr_b32 s15, s5, 8 -; SI-NEXT: s_and_b32 s36, s19, 0xffff -; SI-NEXT: s_and_b32 s30, s23, 0xffff +; SI-NEXT: s_and_b32 s89, s19, 0xffff +; SI-NEXT: s_and_b32 s75, s23, 0xffff ; SI-NEXT: s_and_b32 s61, s27, 0xffff -; SI-NEXT: s_and_b32 s47, s35, 0xffff -; SI-NEXT: s_and_b32 s41, s39, 0xffff -; SI-NEXT: s_bfe_u32 s37, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s31, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s94, s27, 0x80008 -; SI-NEXT: s_bfe_u32 s57, s35, 0x80008 -; SI-NEXT: s_bfe_u32 s43, s39, 0x80008 -; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[90:91], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 8 +; SI-NEXT: s_and_b32 s47, s94, 0xffff +; SI-NEXT: s_and_b32 s41, s30, 0xffff +; SI-NEXT: s_bfe_u32 s91, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s77, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s63, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s57, s94, 0x80008 +; SI-NEXT: s_bfe_u32 s43, s30, 0x80008 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: s_add_i32 s50, s50, 3 -; SI-NEXT: s_and_b32 s4, s50, 0xffff -; SI-NEXT: s_lshl_b32 s5, s49, 16 -; SI-NEXT: s_add_i32 s48, s48, 3 +; SI-NEXT: s_add_i32 s35, s35, 3 +; SI-NEXT: s_and_b32 s4, s35, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: s_add_i32 s31, s31, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s48, 0xffff -; SI-NEXT: s_lshl_b32 s6, s39, 16 +; SI-NEXT: s_and_b32 s5, s31, 0xffff +; SI-NEXT: s_lshl_b32 s6, s30, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s28, 0xffff ; SI-NEXT: s_lshl_b32 s7, s29, 16 -; SI-NEXT: s_add_i32 s38, s38, 3 +; SI-NEXT: s_add_i32 s95, s95, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s38, 0xffff -; SI-NEXT: s_lshl_b32 s8, s35, 16 +; SI-NEXT: s_and_b32 s7, s95, 0xffff +; SI-NEXT: s_lshl_b32 s8, s94, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s24, 0xffff @@ -16648,38 +16641,38 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_and_b32 s13, s18, 0xffff ; SI-NEXT: s_lshl_b32 s14, s19, 16 ; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s12, s12, 0x30000 ; SI-NEXT: s_add_i32 s13, s13, 0x30000 -; SI-NEXT: s_add_i32 s4, s4, 0x30000 -; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 ; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 8 ; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 24 ; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 8 ; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[90:91], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s37, s13, 24 -; SI-NEXT: s_lshr_b32 s36, s13, 16 -; SI-NEXT: s_lshr_b32 s34, s13, 8 -; SI-NEXT: s_lshr_b32 s31, s11, 24 -; SI-NEXT: s_lshr_b32 s30, s11, 16 -; SI-NEXT: s_lshr_b32 s95, s11, 8 -; SI-NEXT: s_lshr_b32 s94, s9, 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s91, s13, 24 +; SI-NEXT: s_lshr_b32 s89, s13, 16 +; SI-NEXT: s_lshr_b32 s79, s13, 8 +; SI-NEXT: s_lshr_b32 s77, s11, 24 +; SI-NEXT: s_lshr_b32 s75, s11, 16 +; SI-NEXT: s_lshr_b32 s73, s11, 8 +; SI-NEXT: s_lshr_b32 s63, s9, 24 ; SI-NEXT: s_lshr_b32 s61, s9, 16 ; SI-NEXT: s_lshr_b32 s59, s9, 8 ; SI-NEXT: s_lshr_b32 s57, s7, 24 @@ -16700,17 +16693,17 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_mov_b32_e32 v1, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xff -; SI-NEXT: s_lshl_b32 s13, s34, 8 +; SI-NEXT: s_lshl_b32 s13, s79, 8 ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: s_and_b32 s13, s36, 0xff +; SI-NEXT: s_and_b32 s13, s89, 0xff ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: s_lshl_b32 s14, s37, 24 +; SI-NEXT: s_lshl_b32 s14, s91, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s12, s60, 8 +; SI-NEXT: s_lshl_b32 s12, s62, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: s_and_b32 s12, s46, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 @@ -16725,11 +16718,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: s_lshl_b32 s11, s95, 8 +; SI-NEXT: s_lshl_b32 s11, s73, 8 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s30, 0xff +; SI-NEXT: s_and_b32 s11, s75, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s12, s31, 24 +; SI-NEXT: s_lshl_b32 s12, s77, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 @@ -16738,9 +16731,9 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s88, 8 +; SI-NEXT: s_lshl_b32 s10, s76, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: s_and_b32 s10, s58, 0xff +; SI-NEXT: s_and_b32 s10, s60, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s11, s44, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff @@ -16755,7 +16748,7 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s61, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s10, s94, 24 +; SI-NEXT: s_lshl_b32 s10, s63, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 @@ -16764,11 +16757,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s74, 8 +; SI-NEXT: s_lshl_b32 s8, s88, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s90, 0xff +; SI-NEXT: s_and_b32 s8, s74, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s78, 24 +; SI-NEXT: s_lshl_b32 s9, s58, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 @@ -16790,11 +16783,11 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s6, s76, 8 +; SI-NEXT: s_lshl_b32 s6, s90, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s72, 0xff +; SI-NEXT: s_and_b32 s6, s78, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s62, 24 +; SI-NEXT: s_lshl_b32 s7, s72, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 @@ -16816,13 +16809,6 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s50, v8, 10 -; SI-NEXT: v_readlane_b32 s49, v8, 9 -; SI-NEXT: v_readlane_b32 s48, v8, 8 -; SI-NEXT: v_readlane_b32 s39, v8, 7 -; SI-NEXT: v_readlane_b32 s38, v8, 6 -; SI-NEXT: v_readlane_b32 s37, v8, 5 -; SI-NEXT: v_readlane_b32 s36, v8, 4 ; SI-NEXT: v_readlane_b32 s35, v8, 3 ; SI-NEXT: v_readlane_b32 s34, v8, 2 ; SI-NEXT: v_readlane_b32 s31, v8, 1 @@ -16837,37 +16823,37 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr59 ; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v20i16_to_v40i8_scalar: @@ -17219,8 +17205,8 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v9 ; GFX9-NEXT: s_branch .LBB49_5 ; GFX9-NEXT: .LBB49_3: ; GFX9-NEXT: ; implicit-def: $sgpr76 @@ -17265,8 +17251,8 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v4, s23 ; GFX9-NEXT: v_mov_b32_e32 v1, s24 ; GFX9-NEXT: v_mov_b32_e32 v2, s25 -; GFX9-NEXT: v_mov_b32_e32 v39, s76 -; GFX9-NEXT: v_mov_b32_e32 v48, s74 +; GFX9-NEXT: v_mov_b32_e32 v48, s76 +; GFX9-NEXT: v_mov_b32_e32 v39, s74 ; GFX9-NEXT: v_mov_b32_e32 v38, s75 ; GFX9-NEXT: v_mov_b32_e32 v36, s73 ; GFX9-NEXT: v_mov_b32_e32 v37, s72 @@ -17296,10 +17282,10 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v14, s6 ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: .LBB49_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v38 @@ -18916,8 +18902,8 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v31, v18 ; SI-NEXT: v_mov_b32_e32 v32, v14 ; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_readfirstlane_b32 s43, v1 -; SI-NEXT: v_readfirstlane_b32 s42, v0 +; SI-NEXT: v_readfirstlane_b32 s41, v1 +; SI-NEXT: v_readfirstlane_b32 s40, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v5 @@ -18933,99 +18919,99 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v25 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v48, v1, v10 +; SI-NEXT: v_or_b32_e32 v25, v9, v48 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v12 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: v_or_b32_e32 v9, v9, v36 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v50, v35, v10 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: v_or_b32_e32 v27, v9, v50 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v20 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s12, s6, s5 +; SI-NEXT: v_or_b32_e32 v9, v9, v38 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_or_b32 s6, s4, s12 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v52, v37, v10 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_or_b32_e32 v29, v9, v52 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_or_b32 s14, s7, s5 +; SI-NEXT: v_or_b32_e32 v9, v9, v3 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_or_b32 s42, s7, s5 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v11, v0, v10 ; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: v_and_b32_e32 v10, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v49, v9, v11 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s23, 24 -; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: v_lshr_b64 v[9:10], v[48:49], 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v16 ; SI-NEXT: s_or_b32 s13, s5, s7 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 16 -; SI-NEXT: v_or_b32_e32 v13, v35, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v8 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v15, v5, v13 ; SI-NEXT: s_and_b32 s5, s28, 0xff ; SI-NEXT: s_lshl_b32 s9, s29, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v24 +; SI-NEXT: v_or_b32_e32 v51, v10, v15 ; SI-NEXT: s_or_b32 s5, s5, s9 -; SI-NEXT: s_and_b32 s9, s42, 0xff -; SI-NEXT: v_or_b32_e32 v9, v9, v34 -; SI-NEXT: v_or_b32_e32 v17, v37, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v19, v0, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v32 -; SI-NEXT: v_or_b32_e32 v39, v5, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_and_b32 s9, s40, 0xff +; SI-NEXT: v_lshr_b64 v[13:14], v[50:51], 16 ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s10, s43, 24 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v10, v10, v3 -; SI-NEXT: v_or_b32_e32 v14, v14, v7 -; SI-NEXT: v_or_b32_e32 v18, v18, v23 -; SI-NEXT: v_or_b32_e32 v48, v21, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v31 +; SI-NEXT: s_lshl_b32 s10, s41, 24 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s12, s10, s9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v10, v10, v23 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_or_b32 s43, s5, s12 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v25, v25, v38 -; SI-NEXT: s_or_b32 s15, s5, s12 +; SI-NEXT: v_or_b32_e32 v14, v21, v14 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v1, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v19 -; SI-NEXT: v_or_b32_e32 v14, v14, v39 -; SI-NEXT: v_or_b32_e32 v18, v18, v48 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v25 -; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 16 -; SI-NEXT: s_or_b32 s4, s4, s14 -; SI-NEXT: v_or_b32_e32 v25, v11, v9 -; SI-NEXT: v_mov_b32_e32 v26, v10 -; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 -; SI-NEXT: v_or_b32_e32 v27, v15, v13 -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 -; SI-NEXT: v_or_b32_e32 v29, v29, v17 -; SI-NEXT: v_mov_b32_e32 v30, v18 -; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[42:43], 16 +; SI-NEXT: v_or_b32_e32 v53, v10, v14 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 ; SI-NEXT: s_lshr_b32 s9, s7, 16 ; SI-NEXT: s_lshr_b32 s11, s12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 ; SI-NEXT: s_mov_b32 s7, s13 -; SI-NEXT: s_mov_b32 s5, s15 +; SI-NEXT: s_mov_b32 s5, s43 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v30, v53 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v31 @@ -19066,18 +19052,18 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s28, 0xff ; SI-NEXT: s_lshl_b32 s6, s29, 8 -; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s42, 0xff +; SI-NEXT: s_and_b32 s7, s40, 0xff ; SI-NEXT: v_or_b32_e32 v9, v36, v9 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_lshl_b32 s6, s43, 24 +; SI-NEXT: s_lshl_b32 s6, s41, 24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 @@ -19194,14 +19180,14 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: v_mov_b32_e32 v31, v14 -; VI-NEXT: v_mov_b32_e32 v27, v12 -; VI-NEXT: v_mov_b32_e32 v32, v10 -; VI-NEXT: v_mov_b32_e32 v29, v8 -; VI-NEXT: v_mov_b32_e32 v33, v6 -; VI-NEXT: v_mov_b32_e32 v30, v4 -; VI-NEXT: v_mov_b32_e32 v34, v2 -; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v32, v12 +; VI-NEXT: v_mov_b32_e32 v34, v10 +; VI-NEXT: v_mov_b32_e32 v31, v8 +; VI-NEXT: v_mov_b32_e32 v28, v6 +; VI-NEXT: v_mov_b32_e32 v27, v4 +; VI-NEXT: v_mov_b32_e32 v29, v2 +; VI-NEXT: v_mov_b32_e32 v30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v3 @@ -19249,18 +19235,18 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v28, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_or_b32_sdwa v0, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v27, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v28, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v27, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v33, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -19279,16 +19265,16 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 ; VI-NEXT: v_or_b32_sdwa v7, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 ; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: v_or_b32_sdwa v6, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s26, 0xff ; VI-NEXT: s_lshl_b32 s6, s27, 8 @@ -19299,13 +19285,13 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s22, 0xff ; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: v_or_b32_sdwa v5, v38, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s8, s20, 0xff ; VI-NEXT: s_lshl_b32 s9, s21, 8 @@ -19316,12 +19302,12 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s10, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v27 ; VI-NEXT: s_or_b32 s9, s10, s9 ; VI-NEXT: s_and_b32 s10, s16, 0xff ; VI-NEXT: s_lshl_b32 s11, s17, 8 ; VI-NEXT: v_or_b32_sdwa v4, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v29 ; VI-NEXT: s_or_b32 s10, s11, s10 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 ; VI-NEXT: v_or_b32_sdwa v3, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -19332,7 +19318,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v20 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -19379,12 +19365,12 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; GFX9-NEXT: v_mov_b32_e32 v29, v14 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 -; GFX9-NEXT: v_mov_b32_e32 v30, v10 -; GFX9-NEXT: v_mov_b32_e32 v27, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v12 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v31, v8 ; GFX9-NEXT: v_mov_b32_e32 v28, v6 -; GFX9-NEXT: v_mov_b32_e32 v34, v4 -; GFX9-NEXT: v_mov_b32_e32 v31, v2 +; GFX9-NEXT: v_mov_b32_e32 v30, v4 +; GFX9-NEXT: v_mov_b32_e32 v27, v2 ; GFX9-NEXT: v_mov_b32_e32 v32, v0 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v1 @@ -19430,16 +19416,16 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, s7, v1 ; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -19468,10 +19454,10 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 ; GFX9-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 @@ -19481,7 +19467,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v31 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_and_b32 s5, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s25, 8 @@ -19492,7 +19478,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: s_lshl_b32 s7, s27, 8 ; GFX9-NEXT: s_add_i32 s20, s20, 3 ; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v27 ; GFX9-NEXT: s_or_b32 s6, s7, s6 ; GFX9-NEXT: s_and_b32 s7, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s21, 8 @@ -19503,7 +19489,7 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: s_lshl_b32 s9, s23, 8 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 ; GFX9-NEXT: s_or_b32 s8, s9, s8 ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 @@ -20255,11 +20241,11 @@ define inreg <5 x double> @bitcast_v20i16_to_v5f64_scalar(<20 x i16> inreg %a, i ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 ; SI-NEXT: v_or_b32_e32 v7, v0, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 +; SI-NEXT: v_or_b32_e32 v8, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v9, v0, v19 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -20501,16 +20487,18 @@ define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v5f64_to_v20i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v28, v9 -; SI-NEXT: v_mov_b32_e32 v27, v8 -; SI-NEXT: v_mov_b32_e32 v26, v7 -; SI-NEXT: v_mov_b32_e32 v25, v6 -; SI-NEXT: v_mov_b32_e32 v24, v5 -; SI-NEXT: v_mov_b32_e32 v23, v4 -; SI-NEXT: v_mov_b32_e32 v22, v3 -; SI-NEXT: v_mov_b32_e32 v21, v2 +; SI-NEXT: v_mov_b32_e32 v21, v9 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_mov_b32_e32 v22, v6 +; SI-NEXT: v_mov_b32_e32 v25, v5 +; SI-NEXT: v_mov_b32_e32 v24, v4 +; SI-NEXT: v_mov_b32_e32 v27, v3 +; SI-NEXT: v_mov_b32_e32 v26, v2 +; SI-NEXT: v_mov_b32_e32 v29, v1 +; SI-NEXT: v_mov_b32_e32 v28, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -20524,47 +20512,47 @@ define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v17, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v13, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v9, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v5, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_alignbit_b32 v17, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v13, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v9, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v5, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 ; SI-NEXT: .LBB54_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_alignbit_b32 v17, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v13, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v9, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v5, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v20, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; SI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; SI-NEXT: v_alignbit_b32 v17, v21, v20, 16 +; SI-NEXT: v_alignbit_b32 v13, v23, v22, 16 +; SI-NEXT: v_alignbit_b32 v9, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v5, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v1, v29, v28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 ; SI-NEXT: .LBB54_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v4, v21 -; SI-NEXT: v_mov_b32_e32 v6, v22 -; SI-NEXT: v_mov_b32_e32 v8, v23 -; SI-NEXT: v_mov_b32_e32 v10, v24 -; SI-NEXT: v_mov_b32_e32 v12, v25 -; SI-NEXT: v_mov_b32_e32 v14, v26 -; SI-NEXT: v_mov_b32_e32 v16, v27 -; SI-NEXT: v_mov_b32_e32 v18, v28 -; SI-NEXT: v_mov_b32_e32 v1, v20 +; SI-NEXT: v_mov_b32_e32 v0, v28 +; SI-NEXT: v_mov_b32_e32 v2, v29 +; SI-NEXT: v_mov_b32_e32 v4, v26 +; SI-NEXT: v_mov_b32_e32 v6, v27 +; SI-NEXT: v_mov_b32_e32 v8, v24 +; SI-NEXT: v_mov_b32_e32 v10, v25 +; SI-NEXT: v_mov_b32_e32 v12, v22 +; SI-NEXT: v_mov_b32_e32 v14, v23 +; SI-NEXT: v_mov_b32_e32 v16, v20 +; SI-NEXT: v_mov_b32_e32 v18, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v20i16: @@ -21101,11 +21089,11 @@ define inreg <5 x i64> @bitcast_v20i16_to_v5i64_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 ; SI-NEXT: v_or_b32_e32 v7, v0, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 +; SI-NEXT: v_or_b32_e32 v8, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v9, v0, v19 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -22092,9 +22080,9 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB60_2 @@ -22186,10 +22174,10 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v48, v23, 8, 8 ; VI-NEXT: .LBB60_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 @@ -22288,10 +22276,10 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB60_2 @@ -22373,10 +22361,10 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX9-NEXT: .LBB60_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 @@ -22835,11 +22823,11 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v21, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB61_4 @@ -22870,20 +22858,20 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_or_b32 s9, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v21 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v5 +; SI-NEXT: v_readfirstlane_b32 s5, v4 ; SI-NEXT: s_or_b32 s6, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v20 ; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v24 +; SI-NEXT: v_readfirstlane_b32 s4, v23 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v23 +; SI-NEXT: v_readfirstlane_b32 s5, v22 ; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s15, v22 +; SI-NEXT: v_readfirstlane_b32 s15, v6 ; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 8 ; SI-NEXT: s_lshr_b64 s[18:19], s[10:11], 24 @@ -22892,11 +22880,11 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 ; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 ; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 8 ; SI-NEXT: s_lshr_b32 s23, s13, 8 @@ -22905,46 +22893,46 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s17, s7, 8 ; SI-NEXT: s_lshr_b32 s15, s5, 8 ; SI-NEXT: v_bfe_u32 v25, v10, 8, 8 -; SI-NEXT: v_bfe_u32 v7, v9, 8, 8 -; SI-NEXT: v_bfe_u32 v6, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v4, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v24, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v7, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v5, v2, 8, 8 ; SI-NEXT: v_bfe_u32 v3, v1, 8, 8 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: v_readfirstlane_b32 s5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_readfirstlane_b32 s5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s6, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s5, v1 -; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readfirstlane_b32 s6, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: v_readfirstlane_b32 s7, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readfirstlane_b32 s6, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -23019,11 +23007,11 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 ; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 ; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 8 ; SI-NEXT: s_lshr_b32 s23, s13, 8 @@ -23032,9 +23020,9 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s17, s7, 8 ; SI-NEXT: s_lshr_b32 s15, s5, 8 ; SI-NEXT: v_bfe_u32 v25, v10, 8, 8 -; SI-NEXT: v_bfe_u32 v7, v9, 8, 8 -; SI-NEXT: v_bfe_u32 v6, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v4, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v24, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v7, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v5, v2, 8, 8 ; SI-NEXT: v_bfe_u32 v3, v1, 8, 8 ; SI-NEXT: .LBB61_3: ; %end ; SI-NEXT: s_and_b32 s12, s12, 0xff @@ -23046,18 +23034,18 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: v_mov_b32_e32 v5, s12 -; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s12, s13, 0xff ; SI-NEXT: s_lshl_b32 s13, s23, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v10 ; SI-NEXT: s_or_b32 s12, s12, s13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v25 ; SI-NEXT: s_and_b32 s12, s12, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_or_b32_e32 v5, s12, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v4, s12, v4 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s12, s26, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 @@ -23066,64 +23054,69 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s13, s18, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 ; SI-NEXT: s_or_b32 s10, s10, s12 -; SI-NEXT: buffer_store_dword v5, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v10, s10 -; SI-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s10, s11, 0xff ; SI-NEXT: s_lshl_b32 s11, s21, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v9 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_or_b32_e32 v5, s10, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v4, s10, v4 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s40, 8 +; SI-NEXT: s_lshl_b32 s10, s42, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_lshl_b32 s11, s24, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0 ; SI-NEXT: s_or_b32 s8, s8, s10 -; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v7, s8 -; SI-NEXT: buffer_store_dword v7, v5, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen ; SI-NEXT: s_and_b32 s8, s9, 0xff ; SI-NEXT: s_lshl_b32 s9, s19, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 ; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v7 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v5, s8, v5 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_or_b32_e32 v4, s8, v4 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s44, 8 +; SI-NEXT: s_lshl_b32 s8, s56, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s42, 0xff +; SI-NEXT: s_and_b32 s8, s44, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s46, 24 +; SI-NEXT: s_lshl_b32 s9, s40, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 ; SI-NEXT: v_mov_b32_e32 v6, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff ; SI-NEXT: s_lshl_b32 s7, s17, 8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_or_b32_e32 v2, s6, v2 @@ -23132,14 +23125,11 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s58, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s56, 24 +; SI-NEXT: s_lshl_b32 s7, s46, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 @@ -23164,31 +23154,31 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v20f16_to_v40i8_scalar: @@ -23524,8 +23514,8 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v9 ; GFX9-NEXT: s_branch .LBB61_5 ; GFX9-NEXT: .LBB61_3: ; GFX9-NEXT: ; implicit-def: $sgpr76 @@ -23570,8 +23560,8 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v4, s23 ; GFX9-NEXT: v_mov_b32_e32 v1, s24 ; GFX9-NEXT: v_mov_b32_e32 v2, s25 -; GFX9-NEXT: v_mov_b32_e32 v39, s76 -; GFX9-NEXT: v_mov_b32_e32 v48, s74 +; GFX9-NEXT: v_mov_b32_e32 v48, s76 +; GFX9-NEXT: v_mov_b32_e32 v39, s74 ; GFX9-NEXT: v_mov_b32_e32 v38, s75 ; GFX9-NEXT: v_mov_b32_e32 v36, s73 ; GFX9-NEXT: v_mov_b32_e32 v37, s72 @@ -23601,10 +23591,10 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v14, s6 ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: .LBB61_5: ; %end +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v39, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v38 @@ -25434,14 +25424,14 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; VI-NEXT: v_mov_b32_e32 v31, v14 -; VI-NEXT: v_mov_b32_e32 v27, v12 -; VI-NEXT: v_mov_b32_e32 v32, v10 -; VI-NEXT: v_mov_b32_e32 v29, v8 -; VI-NEXT: v_mov_b32_e32 v33, v6 -; VI-NEXT: v_mov_b32_e32 v30, v4 -; VI-NEXT: v_mov_b32_e32 v34, v2 -; VI-NEXT: v_mov_b32_e32 v28, v0 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v32, v12 +; VI-NEXT: v_mov_b32_e32 v34, v10 +; VI-NEXT: v_mov_b32_e32 v31, v8 +; VI-NEXT: v_mov_b32_e32 v28, v6 +; VI-NEXT: v_mov_b32_e32 v27, v4 +; VI-NEXT: v_mov_b32_e32 v29, v2 +; VI-NEXT: v_mov_b32_e32 v30, v0 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v3 @@ -25489,18 +25479,18 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s8, s29, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s7, s7, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v28, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v30, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s7, v0 -; VI-NEXT: v_or_b32_sdwa v0, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v30, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v29, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v27, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v33, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v29, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v28, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v27, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v33, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -25519,16 +25509,16 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 ; VI-NEXT: v_or_b32_sdwa v7, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 ; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: v_or_b32_sdwa v6, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s26, 0xff ; VI-NEXT: s_lshl_b32 s6, s27, 8 @@ -25539,13 +25529,13 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s22, 0xff ; VI-NEXT: s_lshl_b32 s8, s23, 8 ; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: v_or_b32_sdwa v5, v38, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s8, s20, 0xff ; VI-NEXT: s_lshl_b32 s9, s21, 8 @@ -25556,12 +25546,12 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s10, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v27 ; VI-NEXT: s_or_b32 s9, s10, s9 ; VI-NEXT: s_and_b32 s10, s16, 0xff ; VI-NEXT: s_lshl_b32 s11, s17, 8 ; VI-NEXT: v_or_b32_sdwa v4, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v29 ; VI-NEXT: s_or_b32 s10, s11, s10 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 ; VI-NEXT: v_or_b32_sdwa v3, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -25572,7 +25562,7 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v20 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -25619,12 +25609,12 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; GFX9-NEXT: v_mov_b32_e32 v29, v14 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 -; GFX9-NEXT: v_mov_b32_e32 v30, v10 -; GFX9-NEXT: v_mov_b32_e32 v27, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v12 +; GFX9-NEXT: v_mov_b32_e32 v33, v10 +; GFX9-NEXT: v_mov_b32_e32 v31, v8 ; GFX9-NEXT: v_mov_b32_e32 v28, v6 -; GFX9-NEXT: v_mov_b32_e32 v34, v4 -; GFX9-NEXT: v_mov_b32_e32 v31, v2 +; GFX9-NEXT: v_mov_b32_e32 v30, v4 +; GFX9-NEXT: v_mov_b32_e32 v27, v2 ; GFX9-NEXT: v_mov_b32_e32 v32, v0 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v1 @@ -25670,16 +25660,16 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, s7, v1 ; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v28, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v29, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -25708,10 +25698,10 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 ; GFX9-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 @@ -25721,7 +25711,7 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v31 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_and_b32 s5, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s25, 8 @@ -25732,7 +25722,7 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: s_lshl_b32 s7, s27, 8 ; GFX9-NEXT: s_add_i32 s20, s20, 3 ; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v27 ; GFX9-NEXT: s_or_b32 s6, s7, s6 ; GFX9-NEXT: s_and_b32 s7, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s21, 8 @@ -25743,7 +25733,7 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; GFX9-NEXT: s_lshl_b32 s9, s23, 8 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 ; GFX9-NEXT: s_or_b32 s8, s9, s8 ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 @@ -26312,41 +26302,41 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB64_2 ; SI-NEXT: .LBB64_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 @@ -26553,41 +26543,41 @@ define inreg <5 x double> @bitcast_v20f16_to_v5f64_scalar(<20 x half> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB65_3 ; SI-NEXT: .LBB65_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 @@ -27047,22 +27037,22 @@ define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, ; SI-NEXT: v_add_f64 v[6:7], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -27071,10 +27061,10 @@ define inreg <20 x half> @bitcast_v5f64_to_v20f16_scalar(<5 x double> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] @@ -27300,41 +27290,41 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB68_2 ; SI-NEXT: .LBB68_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v29 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 @@ -27541,41 +27531,41 @@ define inreg <5 x i64> @bitcast_v20f16_to_v5i64_scalar(<20 x half> inreg %a, i32 ; SI-NEXT: s_cbranch_execnz .LBB69_3 ; SI-NEXT: .LBB69_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v26 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 @@ -30637,10 +30627,10 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB74_2 @@ -30716,10 +30706,10 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; VI-NEXT: .LBB74_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 @@ -30818,10 +30808,10 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB74_2 @@ -30897,10 +30887,10 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX9-NEXT: .LBB74_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 @@ -31352,10 +31342,10 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s92, s17, 24 ; SI-NEXT: s_lshr_b32 s93, s17, 16 ; SI-NEXT: s_lshr_b32 s94, s17, 8 -; SI-NEXT: s_lshr_b64 s[10:11], s[24:25], 24 -; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 8 -; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 ; SI-NEXT: s_lshr_b64 s[12:13], s[22:23], 16 ; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 8 ; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 24 @@ -31371,29 +31361,29 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: .LBB75_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 ; SI-NEXT: v_add_f64 v[15:16], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[21:22], v[8:9], 16 ; SI-NEXT: v_lshr_b64 v[22:23], v[8:9], 8 ; SI-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 ; SI-NEXT: v_lshr_b64 v[23:24], v[15:16], 24 ; SI-NEXT: v_add_f64 v[3:4], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[20:21], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[19:20], s[16:17], 1.0 ; SI-NEXT: v_lshr_b64 v[24:25], v[15:16], 16 ; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 8 ; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 8 ; SI-NEXT: v_lshr_b64 v[11:12], v[3:4], 24 -; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 24 +; SI-NEXT: v_lshr_b64 v[26:27], v[19:20], 24 ; SI-NEXT: v_lshr_b64 v[5:6], v[1:2], 24 ; SI-NEXT: v_lshr_b64 v[12:13], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[8:9], 24 -; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 -; SI-NEXT: v_readfirstlane_b32 s17, v21 +; SI-NEXT: v_lshr_b64 v[27:28], v[19:20], 16 +; SI-NEXT: v_readfirstlane_b32 s17, v20 ; SI-NEXT: v_readfirstlane_b32 s19, v16 ; SI-NEXT: v_readfirstlane_b32 s21, v9 ; SI-NEXT: v_readfirstlane_b32 s23, v4 ; SI-NEXT: v_readfirstlane_b32 s25, v2 ; SI-NEXT: v_lshr_b64 v[6:7], v[1:2], 16 ; SI-NEXT: v_lshr_b64 v[13:14], v[3:4], 8 -; SI-NEXT: v_lshr_b64 v[18:19], v[8:9], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[20:21], 8 +; SI-NEXT: v_lshr_b64 v[17:18], v[8:9], 24 +; SI-NEXT: v_lshr_b64 v[28:29], v[19:20], 8 ; SI-NEXT: s_lshr_b32 s72, s25, 24 ; SI-NEXT: s_lshr_b32 s73, s25, 16 ; SI-NEXT: s_lshr_b32 s74, s25, 8 @@ -31431,19 +31421,19 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: s_branch .LBB75_2 ; SI-NEXT: .LBB75_4: -; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v19, s16 ; SI-NEXT: v_mov_b32_e32 v15, s18 ; SI-NEXT: v_mov_b32_e32 v8, s20 ; SI-NEXT: v_mov_b32_e32 v3, s22 @@ -31455,16 +31445,16 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v24, s44 ; SI-NEXT: v_mov_b32_e32 v23, s42 ; SI-NEXT: v_mov_b32_e32 v22, s40 -; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v21, s28 ; SI-NEXT: v_mov_b32_e32 v17, s26 ; SI-NEXT: v_mov_b32_e32 v13, s14 ; SI-NEXT: v_mov_b32_e32 v12, s12 -; SI-NEXT: v_mov_b32_e32 v11, s8 -; SI-NEXT: v_mov_b32_e32 v10, s6 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s8 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: .LBB75_5: ; %end -; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v28 ; SI-NEXT: s_and_b32 s4, s17, 0xff ; SI-NEXT: s_lshl_b32 s5, s94, 8 @@ -31472,12 +31462,12 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v27 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s93, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v26 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v26 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s92, 24 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -31496,12 +31486,12 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v24 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s90, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v23 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s89, 24 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -31518,11 +31508,11 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s21, 0xff ; SI-NEXT: s_lshl_b32 s5, s88, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v21 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s79, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s78, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -31628,46 +31618,46 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB75_4 ; VI-NEXT: .LBB75_2: ; %cmp.true -; VI-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; VI-NEXT: v_add_f64 v[7:8], s[20:21], 1.0 +; VI-NEXT: v_add_f64 v[11:12], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 ; VI-NEXT: v_add_f64 v[3:4], s[22:23], 1.0 -; VI-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 -; VI-NEXT: v_add_f64 v[7:8], s[18:19], 1.0 -; VI-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] -; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] -; VI-NEXT: v_readfirstlane_b32 s17, v10 -; VI-NEXT: v_readfirstlane_b32 s19, v8 -; VI-NEXT: v_readfirstlane_b32 s21, v6 +; VI-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; VI-NEXT: v_readfirstlane_b32 s17, v14 +; VI-NEXT: v_readfirstlane_b32 s19, v12 +; VI-NEXT: v_readfirstlane_b32 s21, v8 ; VI-NEXT: v_readfirstlane_b32 s23, v4 ; VI-NEXT: v_readfirstlane_b32 s25, v2 -; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] ; VI-NEXT: s_lshr_b32 s26, s25, 24 ; VI-NEXT: s_lshr_b32 s27, s25, 16 ; VI-NEXT: s_lshr_b32 s28, s25, 8 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; VI-NEXT: s_lshr_b32 s29, s23, 24 ; VI-NEXT: s_lshr_b32 s40, s23, 16 ; VI-NEXT: s_lshr_b32 s41, s23, 8 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; VI-NEXT: s_lshr_b32 s42, s21, 24 ; VI-NEXT: s_lshr_b32 s43, s21, 16 ; VI-NEXT: s_lshr_b32 s44, s21, 8 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v7 ; VI-NEXT: s_lshr_b32 s45, s19, 24 ; VI-NEXT: s_lshr_b32 s46, s19, 16 ; VI-NEXT: s_lshr_b32 s47, s19, 8 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v11 ; VI-NEXT: s_lshr_b32 s56, s17, 24 ; VI-NEXT: s_lshr_b32 s57, s17, 16 ; VI-NEXT: s_lshr_b32 s58, s17, 8 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v13 ; VI-NEXT: s_branch .LBB75_5 ; VI-NEXT: .LBB75_3: ; VI-NEXT: ; implicit-def: $sgpr75 @@ -31702,112 +31692,112 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr26 ; VI-NEXT: s_branch .LBB75_2 ; VI-NEXT: .LBB75_4: -; VI-NEXT: v_mov_b32_e32 v9, s16 -; VI-NEXT: v_mov_b32_e32 v7, s18 -; VI-NEXT: v_mov_b32_e32 v5, s20 +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: v_mov_b32_e32 v11, s18 +; VI-NEXT: v_mov_b32_e32 v7, s20 ; VI-NEXT: v_mov_b32_e32 v3, s22 ; VI-NEXT: v_mov_b32_e32 v1, s24 -; VI-NEXT: v_mov_b32_e32 v25, s76 -; VI-NEXT: v_mov_b32_e32 v26, s75 -; VI-NEXT: v_mov_b32_e32 v23, s74 -; VI-NEXT: v_mov_b32_e32 v24, s73 -; VI-NEXT: v_mov_b32_e32 v21, s72 -; VI-NEXT: v_mov_b32_e32 v22, s63 -; VI-NEXT: v_mov_b32_e32 v19, s62 -; VI-NEXT: v_mov_b32_e32 v20, s61 -; VI-NEXT: v_mov_b32_e32 v17, s60 -; VI-NEXT: v_mov_b32_e32 v18, s59 -; VI-NEXT: v_mov_b32_e32 v15, s4 -; VI-NEXT: v_mov_b32_e32 v14, s6 -; VI-NEXT: v_mov_b32_e32 v13, s8 -; VI-NEXT: v_mov_b32_e32 v12, s10 -; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v17, s4 +; VI-NEXT: v_mov_b32_e32 v16, s6 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v9, s10 +; VI-NEXT: v_mov_b32_e32 v5, s12 +; VI-NEXT: v_mov_b32_e32 v19, s76 +; VI-NEXT: v_mov_b32_e32 v20, s75 +; VI-NEXT: v_mov_b32_e32 v14, s74 +; VI-NEXT: v_mov_b32_e32 v18, s73 +; VI-NEXT: v_mov_b32_e32 v10, s72 +; VI-NEXT: v_mov_b32_e32 v12, s63 +; VI-NEXT: v_mov_b32_e32 v6, s62 +; VI-NEXT: v_mov_b32_e32 v8, s61 +; VI-NEXT: v_mov_b32_e32 v2, s60 +; VI-NEXT: v_mov_b32_e32 v4, s59 ; VI-NEXT: .LBB75_5: ; %end ; VI-NEXT: s_and_b32 s4, s17, 0xff ; VI-NEXT: s_lshl_b32 s5, s58, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s57, 0xff ; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v15 +; VI-NEXT: v_or_b32_sdwa v13, v13, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v25, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v13, vcc, 4, v0 +; VI-NEXT: v_mov_b32_e32 v17, s4 ; VI-NEXT: s_and_b32 s4, s19, 0xff ; VI-NEXT: s_lshl_b32 s5, s47, 8 -; VI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v17, v13, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v18 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s46, 0xff ; VI-NEXT: s_lshl_b32 s6, s45, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 +; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v16 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v23, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v13, vcc, 8, v0 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v11, vcc, 12, v0 +; VI-NEXT: v_mov_b32_e32 v13, s4 ; VI-NEXT: s_and_b32 s4, s21, 0xff ; VI-NEXT: s_lshl_b32 s5, s44, 8 -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v13, v11, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v12 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s43, 0xff ; VI-NEXT: s_lshl_b32 s6, s42, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v13 +; VI-NEXT: v_or_b32_sdwa v7, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v15 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: v_or_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v0 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v7, vcc, 20, v0 +; VI-NEXT: v_mov_b32_e32 v10, s4 ; VI-NEXT: s_and_b32 s4, s23, 0xff ; VI-NEXT: s_lshl_b32 s5, s41, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 +; VI-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s40, 0xff ; VI-NEXT: s_lshl_b32 s6, s29, 8 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; VI-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v3, vcc, 24, v0 +; VI-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 24, v0 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v3, vcc, 28, v0 +; VI-NEXT: v_mov_b32_e32 v6, s4 ; VI-NEXT: s_and_b32 s4, s25, 0xff ; VI-NEXT: s_lshl_b32 s5, s28, 8 -; VI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; VI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s27, 0xff ; VI-NEXT: s_lshl_b32 s6, s26, 8 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -31858,46 +31848,46 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB75_4 ; GFX9-NEXT: .LBB75_2: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[7:8], s[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], s[16:17], 1.0 ; GFX9-NEXT: v_add_f64 v[3:4], s[22:23], 1.0 -; GFX9-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 -; GFX9-NEXT: v_add_f64 v[7:8], s[18:19], 1.0 -; GFX9-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[7:8] -; GFX9-NEXT: v_readfirstlane_b32 s17, v10 -; GFX9-NEXT: v_readfirstlane_b32 s19, v8 -; GFX9-NEXT: v_readfirstlane_b32 s21, v6 +; GFX9-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], 24, v[11:12] +; GFX9-NEXT: v_readfirstlane_b32 s17, v14 +; GFX9-NEXT: v_readfirstlane_b32 s19, v12 +; GFX9-NEXT: v_readfirstlane_b32 s21, v8 ; GFX9-NEXT: v_readfirstlane_b32 s23, v4 ; GFX9-NEXT: v_readfirstlane_b32 s25, v2 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[5:6], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[9:10], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] ; GFX9-NEXT: s_lshr_b32 s26, s25, 24 ; GFX9-NEXT: s_lshr_b32 s27, s25, 16 ; GFX9-NEXT: s_lshr_b32 s28, s25, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: s_lshr_b32 s29, s23, 24 ; GFX9-NEXT: s_lshr_b32 s40, s23, 16 ; GFX9-NEXT: s_lshr_b32 s41, s23, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX9-NEXT: s_lshr_b32 s42, s21, 24 ; GFX9-NEXT: s_lshr_b32 s43, s21, 16 ; GFX9-NEXT: s_lshr_b32 s44, s21, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v7 ; GFX9-NEXT: s_lshr_b32 s45, s19, 24 ; GFX9-NEXT: s_lshr_b32 s46, s19, 16 ; GFX9-NEXT: s_lshr_b32 s47, s19, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 8, v11 ; GFX9-NEXT: s_lshr_b32 s56, s17, 24 ; GFX9-NEXT: s_lshr_b32 s57, s17, 16 ; GFX9-NEXT: s_lshr_b32 s58, s17, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v13 ; GFX9-NEXT: s_branch .LBB75_5 ; GFX9-NEXT: .LBB75_3: ; GFX9-NEXT: ; implicit-def: $sgpr75 @@ -31932,105 +31922,105 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr26 ; GFX9-NEXT: s_branch .LBB75_2 ; GFX9-NEXT: .LBB75_4: -; GFX9-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-NEXT: v_mov_b32_e32 v7, s18 -; GFX9-NEXT: v_mov_b32_e32 v5, s20 +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v11, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s20 ; GFX9-NEXT: v_mov_b32_e32 v3, s22 ; GFX9-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-NEXT: v_mov_b32_e32 v25, s76 -; GFX9-NEXT: v_mov_b32_e32 v26, s75 -; GFX9-NEXT: v_mov_b32_e32 v23, s74 -; GFX9-NEXT: v_mov_b32_e32 v24, s73 -; GFX9-NEXT: v_mov_b32_e32 v21, s72 -; GFX9-NEXT: v_mov_b32_e32 v22, s63 -; GFX9-NEXT: v_mov_b32_e32 v19, s62 -; GFX9-NEXT: v_mov_b32_e32 v20, s61 -; GFX9-NEXT: v_mov_b32_e32 v17, s60 -; GFX9-NEXT: v_mov_b32_e32 v18, s59 -; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_mov_b32_e32 v14, s6 -; GFX9-NEXT: v_mov_b32_e32 v13, s8 -; GFX9-NEXT: v_mov_b32_e32 v12, s10 -; GFX9-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-NEXT: v_mov_b32_e32 v16, s6 +; GFX9-NEXT: v_mov_b32_e32 v15, s8 +; GFX9-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: v_mov_b32_e32 v19, s76 +; GFX9-NEXT: v_mov_b32_e32 v20, s75 +; GFX9-NEXT: v_mov_b32_e32 v14, s74 +; GFX9-NEXT: v_mov_b32_e32 v18, s73 +; GFX9-NEXT: v_mov_b32_e32 v10, s72 +; GFX9-NEXT: v_mov_b32_e32 v12, s63 +; GFX9-NEXT: v_mov_b32_e32 v6, s62 +; GFX9-NEXT: v_mov_b32_e32 v8, s61 +; GFX9-NEXT: v_mov_b32_e32 v2, s60 +; GFX9-NEXT: v_mov_b32_e32 v4, s59 ; GFX9-NEXT: .LBB75_5: ; %end ; GFX9-NEXT: s_and_b32 s4, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s58, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s57, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s56, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v13, s4 ; GFX9-NEXT: s_and_b32 s4, s19, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s47, 8 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v18 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s46, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s45, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v24 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v16 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 ; GFX9-NEXT: s_and_b32 s4, s21, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s44, 8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v12 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s43, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s42, 8 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v15 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 ; GFX9-NEXT: s_and_b32 s4, s23, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s41, 8 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v20 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s40, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s29, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v9 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_and_b32 s4, s25, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s28, 8 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s27, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s26, 8 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -34686,10 +34676,10 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr11 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB78_2 @@ -34770,10 +34760,10 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; VI-NEXT: .LBB78_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 @@ -34872,10 +34862,10 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr11 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB78_2 @@ -34956,10 +34946,10 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX9-NEXT: .LBB78_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 @@ -35485,14 +35475,14 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in ; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 8 ; SI-NEXT: .LBB79_3: ; %end -; SI-NEXT: s_lshl_b32 s5, s60, 8 -; SI-NEXT: s_and_b32 s7, s16, 0xff -; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_and_b32 s5, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s60, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_and_b32 s7, s58, 0xff -; SI-NEXT: s_lshl_b32 s9, s56, 24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_lshl_b32 s9, s56, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_and_b32 s5, s17, 0xff @@ -35501,13 +35491,13 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in ; SI-NEXT: s_and_b32 s7, s93, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s9, s92, 24 -; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: v_mov_b32_e32 v2, s5 -; SI-NEXT: s_lshl_b32 s5, s46, 8 -; SI-NEXT: s_and_b32 s7, s18, 0xff -; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s7, s46, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: s_and_b32 s7, s44, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s9, s42, 24 @@ -35641,10 +35631,10 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in ; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB79_2 ; ; VI-LABEL: bitcast_v5i64_to_v40i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll index bd8c305606364..3145da1973e65 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll @@ -5299,136 +5299,138 @@ define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v7 -; SI-NEXT: v_mov_b32_e32 v10, v4 -; SI-NEXT: v_mov_b32_e32 v13, v3 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v17, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_or_b32_e32 v14, v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_or_b32_e32 v20, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_or_b32_e32 v18, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v14, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_or_b32_e32 v10, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v6, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v2, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 +; SI-NEXT: v_lshr_b64 v[23:24], v[9:10], 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshr_b64 v[26:27], v[13:14], 16 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_lshr_b64 v[27:28], v[17:18], 16 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_lshr_b64 v[24:25], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: .LBB23_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v23 -; SI-NEXT: v_mov_b32_e32 v5, v24 -; SI-NEXT: v_mov_b32_e32 v9, v25 -; SI-NEXT: v_mov_b32_e32 v13, v26 -; SI-NEXT: v_mov_b32_e32 v17, v27 +; SI-NEXT: v_mov_b32_e32 v1, v27 +; SI-NEXT: v_mov_b32_e32 v5, v22 +; SI-NEXT: v_mov_b32_e32 v9, v23 +; SI-NEXT: v_mov_b32_e32 v13, v24 +; SI-NEXT: v_mov_b32_e32 v17, v25 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: s_branch .LBB23_2 @@ -5441,61 +5443,61 @@ define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s5, s25, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x200 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s5, s26, 16 -; VI-NEXT: v_add_f16_e32 v1, s25, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_lshr_b32 s5, s24, 16 -; VI-NEXT: v_or_b32_e32 v9, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s24, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s23, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s22, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s21, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v3, s26, v0 -; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s20, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s19, 16 -; VI-NEXT: v_or_b32_e32 v10, v3, v4 -; VI-NEXT: v_or_b32_e32 v4, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s19, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_add_f16_e32 v1, s18, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v1, v2 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_add_f16_sdwa v12, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: s_lshr_b32 s6, s26, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: v_mov_b32_e32 v10, s6 +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: s_lshr_b32 s5, s25, 16 +; VI-NEXT: v_add_f16_e32 v9, s26, v0 +; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_or_b32_e32 v10, v9, v10 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: v_add_f16_sdwa v19, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s4, s24, 16 +; VI-NEXT: v_add_f16_e32 v8, s25, v0 +; VI-NEXT: v_add_f16_sdwa v9, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v8, v9 +; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_add_f16_e32 v11, s16, v0 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s17, v0 -; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_add_f16_sdwa v12, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v13, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v14, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v15, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v16, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_add_f16_sdwa v17, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s22, v0 +; VI-NEXT: v_add_f16_sdwa v18, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, s23, v0 +; VI-NEXT: v_add_f16_sdwa v8, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s24, v0 +; VI-NEXT: v_or_b32_e32 v8, v0, v8 +; VI-NEXT: v_or_b32_e32 v7, v7, v19 +; VI-NEXT: v_or_b32_e32 v6, v6, v18 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_or_b32_e32 v4, v4, v16 +; VI-NEXT: v_or_b32_e32 v3, v3, v15 +; VI-NEXT: v_or_b32_e32 v2, v2, v14 +; VI-NEXT: v_or_b32_e32 v1, v1, v13 ; VI-NEXT: v_or_b32_e32 v0, v11, v12 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll index 4f6801a4dcdfd..2621ad45c31ea 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll @@ -2315,20 +2315,20 @@ define inreg <12 x i32> @bitcast_v24i16_to_v12i32_scalar(<24 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_or_b32_e32 v7, v0, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v8, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v9, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v10, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: v_or_b32_e32 v11, v0, v17 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -3167,36 +3167,36 @@ define <12 x i32> @bitcast_v24f16_to_v12i32(<24 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: .LBB18_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 @@ -5523,20 +5523,20 @@ define inreg <12 x float> @bitcast_v24i16_to_v12f32_scalar(<24 x i16> inreg %a, ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_or_b32_e32 v7, v0, v21 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v15 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v19 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v8, v0, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v9, v0, v19 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v13 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v10, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v12 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: v_or_b32_e32 v11, v0, v17 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -6384,36 +6384,36 @@ define <12 x float> @bitcast_v24f16_to_v12f32(<24 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: .LBB34_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 @@ -7479,18 +7479,20 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v6f64_to_v24i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v34, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v32, v9 -; SI-NEXT: v_mov_b32_e32 v31, v8 -; SI-NEXT: v_mov_b32_e32 v30, v7 -; SI-NEXT: v_mov_b32_e32 v29, v6 -; SI-NEXT: v_mov_b32_e32 v28, v5 -; SI-NEXT: v_mov_b32_e32 v27, v4 -; SI-NEXT: v_mov_b32_e32 v26, v3 -; SI-NEXT: v_mov_b32_e32 v25, v2 +; SI-NEXT: v_mov_b32_e32 v25, v11 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: v_mov_b32_e32 v27, v9 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v29, v7 +; SI-NEXT: v_mov_b32_e32 v28, v6 +; SI-NEXT: v_mov_b32_e32 v31, v5 +; SI-NEXT: v_mov_b32_e32 v30, v4 +; SI-NEXT: v_mov_b32_e32 v33, v3 +; SI-NEXT: v_mov_b32_e32 v32, v2 +; SI-NEXT: v_mov_b32_e32 v35, v1 +; SI-NEXT: v_mov_b32_e32 v34, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -7506,54 +7508,54 @@ define <24 x i16> @bitcast_v6f64_to_v24i16(<6 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v34, v33, 16 -; SI-NEXT: v_alignbit_b32 v17, v32, v31, 16 -; SI-NEXT: v_alignbit_b32 v13, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v9, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v5, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_alignbit_b32 v21, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v17, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v13, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v9, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v5, v33, v32, 16 +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 ; SI-NEXT: .LBB40_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB40_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: v_alignbit_b32 v21, v34, v33, 16 -; SI-NEXT: v_alignbit_b32 v17, v32, v31, 16 -; SI-NEXT: v_alignbit_b32 v13, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v9, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v5, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 +; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; SI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; SI-NEXT: v_alignbit_b32 v21, v25, v24, 16 +; SI-NEXT: v_alignbit_b32 v17, v27, v26, 16 +; SI-NEXT: v_alignbit_b32 v13, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v9, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v5, v33, v32, 16 +; SI-NEXT: v_alignbit_b32 v1, v35, v34, 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v4, v25 -; SI-NEXT: v_mov_b32_e32 v6, v26 -; SI-NEXT: v_mov_b32_e32 v8, v27 -; SI-NEXT: v_mov_b32_e32 v10, v28 -; SI-NEXT: v_mov_b32_e32 v12, v29 -; SI-NEXT: v_mov_b32_e32 v14, v30 -; SI-NEXT: v_mov_b32_e32 v16, v31 -; SI-NEXT: v_mov_b32_e32 v18, v32 -; SI-NEXT: v_mov_b32_e32 v20, v33 -; SI-NEXT: v_mov_b32_e32 v22, v34 -; SI-NEXT: v_mov_b32_e32 v1, v24 +; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v32 +; SI-NEXT: v_mov_b32_e32 v6, v33 +; SI-NEXT: v_mov_b32_e32 v8, v30 +; SI-NEXT: v_mov_b32_e32 v10, v31 +; SI-NEXT: v_mov_b32_e32 v12, v28 +; SI-NEXT: v_mov_b32_e32 v14, v29 +; SI-NEXT: v_mov_b32_e32 v16, v26 +; SI-NEXT: v_mov_b32_e32 v18, v27 +; SI-NEXT: v_mov_b32_e32 v20, v24 +; SI-NEXT: v_mov_b32_e32 v22, v25 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24i16: @@ -8136,20 +8138,20 @@ define inreg <6 x double> @bitcast_v24i16_to_v6f64_scalar(<24 x i16> inreg %a, i ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_or_b32_e32 v7, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v8, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v9, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v10, v0, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v24 ; SI-NEXT: v_or_b32_e32 v11, v0, v21 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -8674,26 +8676,26 @@ define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, ; SI-NEXT: v_add_f64 v[8:9], s[24:25], 1.0 ; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -8703,11 +8705,11 @@ define inreg <24 x half> @bitcast_v6f64_to_v24f16_scalar(<6 x double> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8952,36 +8954,36 @@ define <6 x double> @bitcast_v24f16_to_v6f64(<24 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 @@ -9227,36 +9229,36 @@ define inreg <6 x double> @bitcast_v24f16_to_v6f64_scalar(<24 x half> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 @@ -10190,20 +10192,20 @@ define inreg <6 x i64> @bitcast_v24i16_to_v6i64_scalar(<24 x i16> inreg %a, i32 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: v_or_b32_e32 v7, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v8, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v9, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v10, v0, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v24 ; SI-NEXT: v_or_b32_e32 v11, v0, v21 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -11051,36 +11053,36 @@ define <6 x i64> @bitcast_v24f16_to_v6i64(<24 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB54_2 ; SI-NEXT: .LBB54_4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 @@ -11326,36 +11328,36 @@ define inreg <6 x i64> @bitcast_v24f16_to_v6i64_scalar(<24 x half> inreg %a, i32 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v30 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v28 @@ -12465,137 +12467,139 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; SI-LABEL: bitcast_v24f16_to_v24i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v13, v8 -; SI-NEXT: v_mov_b32_e32 v17, v7 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_mov_b32_e32 v12, v7 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_or_b32_e32 v14, v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v27 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 -; SI-NEXT: v_or_b32_e32 v18, v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_or_b32_e32 v22, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v18, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v14, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v10, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_or_b32_e32 v6, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v26 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v2, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_lshr_b64 v[27:28], v[1:2], 16 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 16 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshr_b64 v[28:29], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[5:6], 16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[21:22], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 @@ -12603,12 +12607,12 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v27 -; SI-NEXT: v_mov_b32_e32 v5, v25 -; SI-NEXT: v_mov_b32_e32 v9, v28 -; SI-NEXT: v_mov_b32_e32 v13, v29 -; SI-NEXT: v_mov_b32_e32 v17, v30 -; SI-NEXT: v_mov_b32_e32 v21, v31 +; SI-NEXT: v_mov_b32_e32 v1, v25 +; SI-NEXT: v_mov_b32_e32 v5, v26 +; SI-NEXT: v_mov_b32_e32 v9, v31 +; SI-NEXT: v_mov_b32_e32 v13, v27 +; SI-NEXT: v_mov_b32_e32 v17, v28 +; SI-NEXT: v_mov_b32_e32 v21, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 @@ -12621,66 +12625,66 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB59_4 ; VI-NEXT: .LBB59_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s5, s26, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x200 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s5, s27, 16 -; VI-NEXT: v_add_f16_e32 v1, s26, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_lshr_b32 s5, s25, 16 -; VI-NEXT: v_or_b32_e32 v10, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s25, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s24, 16 -; VI-NEXT: v_or_b32_e32 v9, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s24, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s23, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s22, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s21, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v3, s27, v0 -; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s20, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s19, 16 -; VI-NEXT: v_or_b32_e32 v11, v3, v4 -; VI-NEXT: v_or_b32_e32 v4, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s19, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_add_f16_e32 v1, s18, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: s_lshr_b32 s8, s27, 16 +; VI-NEXT: v_mov_b32_e32 v0, 0x200 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_add_f16_sdwa v13, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v11, s8 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: s_lshr_b32 s7, s26, 16 +; VI-NEXT: v_add_f16_e32 v10, s27, v0 +; VI-NEXT: v_add_f16_sdwa v11, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v11, v10, v11 +; VI-NEXT: v_mov_b32_e32 v10, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: s_lshr_b32 s6, s25, 16 +; VI-NEXT: v_add_f16_e32 v9, s26, v0 +; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_or_b32_e32 v10, v9, v10 +; VI-NEXT: v_mov_b32_e32 v9, s6 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_lshr_b32 s4, s22, 16 +; VI-NEXT: s_lshr_b32 s5, s24, 16 +; VI-NEXT: v_add_f16_e32 v8, s25, v0 +; VI-NEXT: v_add_f16_sdwa v9, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: v_or_b32_e32 v9, v8, v9 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: v_add_f16_sdwa v19, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s4, s23, 16 +; VI-NEXT: v_add_f16_e32 v7, s24, v0 +; VI-NEXT: v_add_f16_sdwa v8, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v7, v8 +; VI-NEXT: v_mov_b32_e32 v7, s4 ; VI-NEXT: v_add_f16_e32 v12, s16, v0 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s17, v0 -; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_add_f16_sdwa v13, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v14, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v15, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v16, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v17, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, s21, v0 +; VI-NEXT: v_add_f16_sdwa v18, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, s22, v0 +; VI-NEXT: v_add_f16_sdwa v7, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s23, v0 +; VI-NEXT: v_or_b32_e32 v7, v0, v7 +; VI-NEXT: v_or_b32_e32 v6, v6, v19 +; VI-NEXT: v_or_b32_e32 v5, v5, v18 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_or_b32_e32 v3, v3, v16 +; VI-NEXT: v_or_b32_e32 v2, v2, v15 +; VI-NEXT: v_or_b32_e32 v1, v1, v14 ; VI-NEXT: v_or_b32_e32 v0, v12, v13 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB59_3: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll index 7fbc631c10e34..cb287c95d01ab 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll @@ -2548,30 +2548,30 @@ define inreg <14 x i32> @bitcast_v28i16_to_v14i32_scalar(<28 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: v_or_b32_e32 v7, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v8, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v9, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v10, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v11, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v12, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: v_or_b32_e32 v13, v0, v23 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -6107,30 +6107,30 @@ define inreg <14 x float> @bitcast_v28i16_to_v14f32_scalar(<28 x i16> inreg %a, ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: v_or_b32_e32 v7, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v8, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v9, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v10, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v11, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v12, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: v_or_b32_e32 v13, v0, v23 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -9027,30 +9027,30 @@ define inreg <7 x i64> @bitcast_v28i16_to_v7i64_scalar(<28 x i16> inreg %a, i32 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: v_or_b32_e32 v7, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v8, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v9, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v10, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v11, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v12, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: v_or_b32_e32 v13, v0, v23 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -10637,20 +10637,22 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v7f64_to_v28i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v38, v11 -; SI-NEXT: v_mov_b32_e32 v37, v10 -; SI-NEXT: v_mov_b32_e32 v36, v9 -; SI-NEXT: v_mov_b32_e32 v35, v8 -; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_mov_b32_e32 v33, v6 -; SI-NEXT: v_mov_b32_e32 v32, v5 -; SI-NEXT: v_mov_b32_e32 v31, v4 -; SI-NEXT: v_mov_b32_e32 v30, v3 -; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v29, v13 +; SI-NEXT: v_mov_b32_e32 v28, v12 +; SI-NEXT: v_mov_b32_e32 v31, v11 +; SI-NEXT: v_mov_b32_e32 v30, v10 +; SI-NEXT: v_mov_b32_e32 v33, v9 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: v_mov_b32_e32 v35, v7 +; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: v_mov_b32_e32 v37, v5 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v49, v1 +; SI-NEXT: v_mov_b32_e32 v48, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -10668,61 +10670,61 @@ define <28 x i16> @bitcast_v7f64_to_v28i16(<7 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v25, v49, v48, 16 -; SI-NEXT: v_alignbit_b32 v21, v38, v37, 16 -; SI-NEXT: v_alignbit_b32 v17, v36, v35, 16 -; SI-NEXT: v_alignbit_b32 v13, v34, v33, 16 -; SI-NEXT: v_alignbit_b32 v9, v32, v31, 16 -; SI-NEXT: v_alignbit_b32 v5, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_alignbit_b32 v25, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v21, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v17, v33, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v35, v34, 16 +; SI-NEXT: v_alignbit_b32 v9, v37, v36, 16 +; SI-NEXT: v_alignbit_b32 v5, v39, v38, 16 +; SI-NEXT: v_alignbit_b32 v1, v49, v48, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49 ; SI-NEXT: .LBB48_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 -; SI-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 ; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 -; SI-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 -; SI-NEXT: v_alignbit_b32 v25, v49, v48, 16 -; SI-NEXT: v_alignbit_b32 v21, v38, v37, 16 -; SI-NEXT: v_alignbit_b32 v17, v36, v35, 16 -; SI-NEXT: v_alignbit_b32 v13, v34, v33, 16 -; SI-NEXT: v_alignbit_b32 v9, v32, v31, 16 -; SI-NEXT: v_alignbit_b32 v5, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v28, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[38:39], v[38:39], 1.0 +; SI-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 +; SI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 +; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; SI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; SI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; SI-NEXT: v_alignbit_b32 v25, v29, v28, 16 +; SI-NEXT: v_alignbit_b32 v21, v31, v30, 16 +; SI-NEXT: v_alignbit_b32 v17, v33, v32, 16 +; SI-NEXT: v_alignbit_b32 v13, v35, v34, 16 +; SI-NEXT: v_alignbit_b32 v9, v37, v36, 16 +; SI-NEXT: v_alignbit_b32 v5, v39, v38, 16 +; SI-NEXT: v_alignbit_b32 v1, v49, v48, 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v4, v29 -; SI-NEXT: v_mov_b32_e32 v6, v30 -; SI-NEXT: v_mov_b32_e32 v8, v31 -; SI-NEXT: v_mov_b32_e32 v10, v32 -; SI-NEXT: v_mov_b32_e32 v12, v33 -; SI-NEXT: v_mov_b32_e32 v14, v34 -; SI-NEXT: v_mov_b32_e32 v16, v35 -; SI-NEXT: v_mov_b32_e32 v18, v36 -; SI-NEXT: v_mov_b32_e32 v20, v37 -; SI-NEXT: v_mov_b32_e32 v22, v38 -; SI-NEXT: v_mov_b32_e32 v24, v48 -; SI-NEXT: v_mov_b32_e32 v26, v49 -; SI-NEXT: v_mov_b32_e32 v1, v28 +; SI-NEXT: v_mov_b32_e32 v0, v48 +; SI-NEXT: v_mov_b32_e32 v2, v49 +; SI-NEXT: v_mov_b32_e32 v4, v38 +; SI-NEXT: v_mov_b32_e32 v6, v39 +; SI-NEXT: v_mov_b32_e32 v8, v36 +; SI-NEXT: v_mov_b32_e32 v10, v37 +; SI-NEXT: v_mov_b32_e32 v12, v34 +; SI-NEXT: v_mov_b32_e32 v14, v35 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: v_mov_b32_e32 v18, v33 +; SI-NEXT: v_mov_b32_e32 v20, v30 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: v_mov_b32_e32 v24, v28 +; SI-NEXT: v_mov_b32_e32 v26, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v28i16: @@ -11353,30 +11355,30 @@ define inreg <7 x double> @bitcast_v28i16_to_v7f64_scalar(<28 x i16> inreg %a, i ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: v_or_b32_e32 v7, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v8, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v9, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v10, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v11, v0, v25 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v12, v0, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: v_or_b32_e32 v13, v0, v23 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -11958,11 +11960,12 @@ define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, ; SI-NEXT: v_add_f64 v[10:11], s[26:27], 1.0 ; SI-NEXT: v_add_f64 v[12:13], s[28:29], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 @@ -11970,18 +11973,17 @@ define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -11992,12 +11994,12 @@ define inreg <28 x half> @bitcast_v7f64_to_v28f16_scalar(<7 x double> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] @@ -13867,159 +13869,156 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-LABEL: bitcast_v28f16_to_v28i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v17, v12 -; SI-NEXT: v_mov_b32_e32 v21, v11 -; SI-NEXT: v_mov_b32_e32 v22, v8 -; SI-NEXT: v_mov_b32_e32 v25, v7 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v26, v3 -; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v18, v3 +; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v15, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v29 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_or_b32_e32 v18, v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 -; SI-NEXT: v_or_b32_e32 v14, v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 -; SI-NEXT: v_or_b32_e32 v22, v22, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v32 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_or_b32_e32 v26, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v22, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v18, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v14, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v10, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_or_b32_e32 v6, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v2, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshr_b64 v[34:35], v[1:2], 16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_lshr_b64 v[35:36], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v26, v26, v28 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshr_b64 v[29:30], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[25:26], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 @@ -14028,13 +14027,13 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v34 -; SI-NEXT: v_mov_b32_e32 v5, v32 -; SI-NEXT: v_mov_b32_e32 v9, v29 -; SI-NEXT: v_mov_b32_e32 v13, v35 -; SI-NEXT: v_mov_b32_e32 v17, v30 -; SI-NEXT: v_mov_b32_e32 v21, v36 -; SI-NEXT: v_mov_b32_e32 v25, v37 +; SI-NEXT: v_mov_b32_e32 v1, v29 +; SI-NEXT: v_mov_b32_e32 v5, v35 +; SI-NEXT: v_mov_b32_e32 v9, v36 +; SI-NEXT: v_mov_b32_e32 v13, v30 +; SI-NEXT: v_mov_b32_e32 v17, v31 +; SI-NEXT: v_mov_b32_e32 v21, v32 +; SI-NEXT: v_mov_b32_e32 v25, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 @@ -14048,76 +14047,76 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB59_4 ; VI-NEXT: .LBB59_2: ; %cmp.true -; VI-NEXT: s_lshr_b32 s5, s28, 16 +; VI-NEXT: s_lshr_b32 s12, s29, 16 ; VI-NEXT: v_mov_b32_e32 v0, 0x200 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s5, s29, 16 -; VI-NEXT: v_add_f16_e32 v1, s28, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_lshr_b32 s5, s27, 16 -; VI-NEXT: v_or_b32_e32 v12, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s27, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s26, 16 -; VI-NEXT: v_or_b32_e32 v11, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s26, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s25, 16 -; VI-NEXT: v_or_b32_e32 v10, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s25, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s24, 16 -; VI-NEXT: v_or_b32_e32 v9, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s24, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s23, 16 -; VI-NEXT: v_or_b32_e32 v8, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s23, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v7, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s22, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s21, 16 -; VI-NEXT: v_or_b32_e32 v6, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s21, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s20, 16 -; VI-NEXT: v_or_b32_e32 v5, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v3, s29, v0 -; VI-NEXT: v_add_f16_sdwa v4, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, s20, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s19, 16 -; VI-NEXT: v_or_b32_e32 v13, v3, v4 -; VI-NEXT: v_or_b32_e32 v4, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_add_f16_e32 v1, s19, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_lshr_b32 s5, s18, 16 -; VI-NEXT: v_or_b32_e32 v3, v1, v2 -; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v13, s12 +; VI-NEXT: s_lshr_b32 s11, s28, 16 +; VI-NEXT: v_add_f16_e32 v12, s29, v0 +; VI-NEXT: v_add_f16_sdwa v13, v13, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v12, v13 +; VI-NEXT: v_mov_b32_e32 v12, s11 +; VI-NEXT: s_lshr_b32 s10, s27, 16 +; VI-NEXT: v_add_f16_e32 v11, s28, v0 +; VI-NEXT: v_add_f16_sdwa v12, v12, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v11, v12 +; VI-NEXT: v_mov_b32_e32 v11, s10 +; VI-NEXT: s_lshr_b32 s9, s26, 16 +; VI-NEXT: v_add_f16_e32 v10, s27, v0 +; VI-NEXT: v_add_f16_sdwa v11, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v10, v11 +; VI-NEXT: v_mov_b32_e32 v10, s9 +; VI-NEXT: s_lshr_b32 s8, s25, 16 +; VI-NEXT: v_add_f16_e32 v9, s26, v0 +; VI-NEXT: v_add_f16_sdwa v10, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v9, v10 +; VI-NEXT: v_mov_b32_e32 v9, s8 ; VI-NEXT: s_lshr_b32 s4, s16, 16 -; VI-NEXT: v_add_f16_e32 v1, s18, v0 -; VI-NEXT: v_add_f16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: s_lshr_b32 s7, s24, 16 +; VI-NEXT: v_add_f16_e32 v8, s25, v0 +; VI-NEXT: v_add_f16_sdwa v9, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_lshr_b32 s4, s17, 16 -; VI-NEXT: v_add_f16_sdwa v15, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_or_b32_e32 v9, v8, v9 +; VI-NEXT: v_mov_b32_e32 v8, s7 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_lshr_b32 s4, s18, 16 +; VI-NEXT: s_lshr_b32 s6, s23, 16 +; VI-NEXT: v_add_f16_e32 v7, s24, v0 +; VI-NEXT: v_add_f16_sdwa v8, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: s_lshr_b32 s4, s19, 16 +; VI-NEXT: v_or_b32_e32 v8, v7, v8 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s4, s20, 16 +; VI-NEXT: s_lshr_b32 s5, s22, 16 +; VI-NEXT: v_add_f16_e32 v6, s23, v0 +; VI-NEXT: v_add_f16_sdwa v7, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: v_or_b32_e32 v7, v6, v7 +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_add_f16_sdwa v19, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s4, s21, 16 +; VI-NEXT: v_add_f16_e32 v5, s22, v0 +; VI-NEXT: v_add_f16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v5, v6 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_add_f16_e32 v14, s16, v0 -; VI-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s17, v0 -; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_add_f16_sdwa v15, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, s17, v0 +; VI-NEXT: v_add_f16_sdwa v16, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s18, v0 +; VI-NEXT: v_add_f16_sdwa v17, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, s19, v0 +; VI-NEXT: v_add_f16_sdwa v18, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, s20, v0 +; VI-NEXT: v_add_f16_sdwa v5, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s21, v0 +; VI-NEXT: v_or_b32_e32 v5, v0, v5 +; VI-NEXT: v_or_b32_e32 v4, v4, v19 +; VI-NEXT: v_or_b32_e32 v3, v3, v18 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_or_b32_e32 v1, v1, v16 ; VI-NEXT: v_or_b32_e32 v0, v14, v15 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB59_3: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index a7f89579b5ce0..7be3a8214a3f2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -2146,88 +2146,88 @@ define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_mov_b32_e32 v33, v1 ; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_mov_b32_e32 v34, s16 -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v36, s18 -; SI-NEXT: v_mov_b32_e32 v37, s19 -; SI-NEXT: v_mov_b32_e32 v38, s20 -; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v54, s16 +; SI-NEXT: v_mov_b32_e32 v55, s17 +; SI-NEXT: v_mov_b32_e32 v52, s18 +; SI-NEXT: v_mov_b32_e32 v53, s19 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v51, s21 ; SI-NEXT: v_mov_b32_e32 v48, s22 ; SI-NEXT: v_mov_b32_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v50, s24 -; SI-NEXT: v_mov_b32_e32 v51, s25 -; SI-NEXT: v_mov_b32_e32 v52, s26 -; SI-NEXT: v_mov_b32_e32 v53, s27 -; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: v_mov_b32_e32 v38, s24 +; SI-NEXT: v_mov_b32_e32 v39, s25 +; SI-NEXT: v_mov_b32_e32 v36, s26 +; SI-NEXT: v_mov_b32_e32 v37, s27 +; SI-NEXT: v_mov_b32_e32 v34, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v55, s29 +; SI-NEXT: v_mov_b32_e32 v35, s29 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[38:39], 16 ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 ; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v51, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[38:39], 16 ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v34 -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v36 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_mov_b32_e32 v8, v38 -; SI-NEXT: v_mov_b32_e32 v10, v39 +; SI-NEXT: v_mov_b32_e32 v0, v54 +; SI-NEXT: v_mov_b32_e32 v2, v55 +; SI-NEXT: v_mov_b32_e32 v4, v52 +; SI-NEXT: v_mov_b32_e32 v6, v53 +; SI-NEXT: v_mov_b32_e32 v8, v50 +; SI-NEXT: v_mov_b32_e32 v10, v51 ; SI-NEXT: v_mov_b32_e32 v12, v48 ; SI-NEXT: v_mov_b32_e32 v14, v49 -; SI-NEXT: v_mov_b32_e32 v16, v50 -; SI-NEXT: v_mov_b32_e32 v18, v51 -; SI-NEXT: v_mov_b32_e32 v20, v52 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v26, v55 +; SI-NEXT: v_mov_b32_e32 v16, v38 +; SI-NEXT: v_mov_b32_e32 v18, v39 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: v_mov_b32_e32 v22, v37 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v26, v35 ; SI-NEXT: v_mov_b32_e32 v28, v32 ; SI-NEXT: v_mov_b32_e32 v30, v33 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2244,10 +2244,10 @@ define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v16i32_to_v32i16_scalar: @@ -2749,40 +2749,40 @@ define inreg <16 x i32> @bitcast_v32i16_to_v16i32_scalar(<32 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: v_or_b32_e32 v7, v0, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v8, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v9, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v10, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v11, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v12, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v13, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v32 ; SI-NEXT: v_or_b32_e32 v15, v0, v17 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -3890,41 +3890,41 @@ define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 @@ -5303,18 +5303,18 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB22_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -6594,53 +6594,53 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s16 ; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 -; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 -; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 -; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshr_b64 v[0:1], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v63 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v62 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v61 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v59 ; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 ; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 @@ -6658,46 +6658,46 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 ; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v15, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 ; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 ; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -6790,8 +6790,8 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-LABEL: bitcast_v32bf16_to_v16i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v10, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -6804,11 +6804,11 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB23_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB23_3 @@ -8581,9 +8581,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -8600,108 +8598,107 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; kill: killed $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB24_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16 ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; VI-NEXT: v_mov_b32_e32 v26, v22 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; VI-NEXT: .LBB24_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB24_4 @@ -8717,176 +8714,173 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 ; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 ; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] ; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 ; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; VI-NEXT: .LBB24_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v27, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v28 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v26, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; VI-NEXT: v_or_b32_sdwa v2, v2, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -8912,9 +8906,7 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -8931,108 +8923,107 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB24_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v16 ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v26, v23 -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB24_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB24_4 @@ -9051,158 +9042,155 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 ; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB24_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v27, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v26, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v42 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v26 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -9850,8 +9838,8 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_writelane_b32 v18, s84, 28 -; SI-NEXT: v_readfirstlane_b32 s20, v4 -; SI-NEXT: v_readfirstlane_b32 s21, v5 +; SI-NEXT: v_readfirstlane_b32 s24, v4 +; SI-NEXT: v_readfirstlane_b32 s25, v5 ; SI-NEXT: v_readfirstlane_b32 s16, v6 ; SI-NEXT: v_readfirstlane_b32 s17, v7 ; SI-NEXT: v_readfirstlane_b32 s14, v8 @@ -9891,12 +9879,12 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s80, s17, 24 ; SI-NEXT: s_lshr_b32 s81, s17, 16 ; SI-NEXT: s_lshr_b32 s82, s17, 8 -; SI-NEXT: s_lshr_b32 s83, s21, 24 -; SI-NEXT: s_lshr_b32 s84, s21, 16 -; SI-NEXT: s_lshr_b32 s85, s21, 8 +; SI-NEXT: s_lshr_b32 s83, s25, 24 +; SI-NEXT: s_lshr_b32 s84, s25, 16 +; SI-NEXT: s_lshr_b32 s85, s25, 8 ; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 8 ; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 ; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 @@ -9908,20 +9896,20 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 ; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[14:15], 8 ; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 ; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[24:25], 8 ; SI-NEXT: s_cbranch_execnz .LBB25_3 ; SI-NEXT: .LBB25_2: ; %cmp.true -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 @@ -9937,8 +9925,8 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: s_add_i32 s5, s5, 3 ; SI-NEXT: s_add_i32 s4, s4, 3 ; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 8 ; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 ; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 ; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 @@ -9950,6 +9938,16 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 ; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[24:25], 8 ; SI-NEXT: s_lshr_b32 s38, s5, 24 ; SI-NEXT: s_lshr_b32 s39, s5, 16 ; SI-NEXT: s_lshr_b32 s48, s5, 8 @@ -9971,75 +9969,66 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s80, s17, 24 ; SI-NEXT: s_lshr_b32 s81, s17, 16 ; SI-NEXT: s_lshr_b32 s82, s17, 8 -; SI-NEXT: s_lshr_b32 s83, s21, 24 -; SI-NEXT: s_lshr_b32 s84, s21, 16 -; SI-NEXT: s_lshr_b32 s85, s21, 8 -; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[34:35], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[36:37], s[20:21], 8 +; SI-NEXT: s_lshr_b32 s83, s25, 24 +; SI-NEXT: s_lshr_b32 s84, s25, 16 +; SI-NEXT: s_lshr_b32 s85, s25, 8 ; SI-NEXT: .LBB25_3: ; %end -; SI-NEXT: s_lshl_b32 s19, s36, 8 -; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_and_b32 s20, s34, 0xff +; SI-NEXT: s_and_b32 s19, s24, 0xff +; SI-NEXT: s_lshl_b32 s21, s36, 8 +; SI-NEXT: s_or_b32 s19, s19, s21 +; SI-NEXT: s_and_b32 s21, s34, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_lshl_b32 s23, s30, 24 -; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_or_b32 s20, s23, s20 ; SI-NEXT: s_and_b32 s19, s19, 0xffff -; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_or_b32 s21, s23, s21 +; SI-NEXT: s_or_b32 s19, s19, s21 ; SI-NEXT: v_mov_b32_e32 v1, s19 -; SI-NEXT: s_and_b32 s19, s21, 0xff -; SI-NEXT: s_lshl_b32 s20, s85, 8 -; SI-NEXT: s_or_b32 s19, s19, s20 -; SI-NEXT: s_and_b32 s20, s84, 0xff -; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_lshl_b32 s21, s83, 24 -; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_and_b32 s19, s25, 0xff +; SI-NEXT: s_lshl_b32 s21, s85, 8 +; SI-NEXT: s_or_b32 s19, s19, s21 +; SI-NEXT: s_and_b32 s21, s84, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_lshl_b32 s23, s83, 24 ; SI-NEXT: s_and_b32 s19, s19, 0xffff -; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_or_b32 s21, s23, s21 +; SI-NEXT: s_or_b32 s19, s19, s21 ; SI-NEXT: v_mov_b32_e32 v2, s19 -; SI-NEXT: s_lshl_b32 s19, s94, 8 ; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s19, s94, 8 ; SI-NEXT: s_or_b32 s16, s16, s19 ; SI-NEXT: s_and_b32 s19, s92, 0xff -; SI-NEXT: s_lshl_b32 s20, s90, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_lshl_b32 s21, s90, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s19, s21, s19 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: s_or_b32 s16, s16, s19 -; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s17, 0xff ; SI-NEXT: s_lshl_b32 s17, s82, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s81, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s19, s80, 24 -; SI-NEXT: s_or_b32 s17, s19, s17 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s17, s19, s17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s16, s78, 8 +; SI-NEXT: s_lshl_b32 s16, s88, 8 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: s_and_b32 s16, s76, 0xff +; SI-NEXT: s_and_b32 s16, s78, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s74, 24 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_lshl_b32 s17, s76, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -10059,7 +10048,7 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s14, s88, 8 +; SI-NEXT: s_lshl_b32 s14, s74, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: s_and_b32 s14, s72, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 @@ -10163,9 +10152,9 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s6, s24, 8 +; SI-NEXT: s_lshl_b32 s6, s22, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_and_b32 s6, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s7, s18, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -10237,42 +10226,42 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr81 ; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr67 ; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: s_branch .LBB25_2 ; ; VI-LABEL: bitcast_v16i32_to_v64i8_scalar: @@ -11537,19 +11526,19 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 @@ -11564,24 +11553,24 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v17 ; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 ; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v17 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v2 ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v4 ; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v6 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -11599,9 +11588,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v53 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v55 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v40 ; SI-NEXT: s_waitcnt vmcnt(3) @@ -11617,47 +11606,43 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v47 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 ; SI-NEXT: v_or_b32_e32 v0, v0, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v62, v9 @@ -11672,16 +11657,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v12, v54, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -11691,67 +11673,79 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v18 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -11798,10 +11792,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v40 -; SI-NEXT: v_or_b32_e32 v13, v13, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v21 ; SI-NEXT: v_or_b32_e32 v14, v14, v25 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 @@ -11812,24 +11806,19 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v23 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 @@ -11856,19 +11845,20 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: .LBB26_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -11882,17 +11872,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -11912,75 +11898,85 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_mov_b32 s7, 0x3000000 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_or_b32_e32 v4, v18, v4 @@ -12046,11 +12042,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v19, v13 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v21 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v14, v25, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 @@ -12068,15 +12064,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v23, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 ; SI-NEXT: .LBB26_4: ; %end @@ -12144,7 +12134,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 @@ -12154,7 +12144,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 ; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 @@ -12184,7 +12174,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v26 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v30 ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v39 ; VI-NEXT: s_waitcnt vmcnt(7) @@ -12207,7 +12197,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 @@ -12223,20 +12213,18 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v30, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr53 @@ -12247,6 +12235,8 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -12256,43 +12246,43 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -12388,10 +12378,8 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12401,7 +12389,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v12, 3, v38 +; VI-NEXT: v_add_u16_e32 v12, 3, v30 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v13, 3, v58 @@ -12412,6 +12400,8 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -12421,56 +12411,56 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_add_u16_e32 v1, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v53 ; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v8, 3, v8 ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_sdwa v15, v17, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v15, v17, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -12598,7 +12588,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 @@ -12608,7 +12598,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 ; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 @@ -12642,7 +12632,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v26 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v30 ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v39 ; GFX9-NEXT: s_waitcnt vmcnt(7) @@ -12665,7 +12655,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 @@ -12681,20 +12671,18 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v30, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr53 @@ -12705,6 +12693,8 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -12714,43 +12704,43 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -12846,10 +12836,8 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12859,7 +12847,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v13, 3, v58 @@ -12871,6 +12859,8 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v15, 3, v53 ; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -12880,53 +12870,53 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v16, 3, v17 ; GFX9-NEXT: v_or_b32_sdwa v16, v60, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 ; GFX9-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -13875,24 +13865,25 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v28 -; SI-NEXT: v_mov_b32_e32 v38, v26 -; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v50, v24 ; SI-NEXT: v_mov_b32_e32 v51, v14 ; SI-NEXT: v_mov_b32_e32 v54, v12 ; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v44, v6 -; SI-NEXT: v_mov_b32_e32 v33, v4 -; SI-NEXT: v_mov_b32_e32 v32, v2 -; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v4 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 @@ -13904,8 +13895,9 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v5 @@ -13918,43 +13910,48 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v10 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v14 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v26 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v61, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 ; SI-NEXT: v_or_b32_e32 v0, v0, v60 @@ -13975,135 +13972,163 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 -; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: v_mov_b32_e32 v26, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v0, v0, v29 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 -; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: s_or_b32 s8, s4, s5 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: v_or_b32_e32 v2, v2, v62 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_mov_b32_e32 v24, v43 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_cbranch_execnz .LBB27_3 +; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v52, v42 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_cbranch_execnz .LBB27_3 -; SI-NEXT: .LBB27_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v63, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v31 ; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -14144,85 +14169,30 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 ; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v48 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s20, 0xff -; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s24, 0xff -; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s26, 0xff -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_add_i32 s4, s4, 0x3000000 -; SI-NEXT: s_add_i32 s5, s5, 0x3000000 -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 ; SI-NEXT: v_or_b32_e32 v0, v21, v0 @@ -14232,44 +14202,59 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v26, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -14294,9 +14279,8 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB27_4: -; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v43 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB27_2 ; @@ -14322,33 +14306,34 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v36, v28 ; VI-NEXT: v_mov_b32_e32 v35, v26 ; VI-NEXT: v_mov_b32_e32 v34, v24 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: v_mov_b32_e32 v48, v12 -; VI-NEXT: v_mov_b32_e32 v49, v10 -; VI-NEXT: v_mov_b32_e32 v50, v8 -; VI-NEXT: v_mov_b32_e32 v51, v6 -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v45, v0 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v39, v12 +; VI-NEXT: v_mov_b32_e32 v48, v10 +; VI-NEXT: v_mov_b32_e32 v49, v8 +; VI-NEXT: v_mov_b32_e32 v50, v6 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v44, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:68 ; VI-NEXT: v_mov_b32_e32 v37, v30 ; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 @@ -14362,147 +14347,179 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v4 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: v_or_b32_sdwa v0, v53, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v50, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v33, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s8, s4, s5 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v52, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v42, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v44 ; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_cbranch_execnz .LBB27_3 +; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff ; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v28, v44 -; VI-NEXT: v_mov_b32_e32 v33, v42 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_cbranch_execnz .LBB27_3 -; VI-NEXT: .LBB27_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v28 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v45 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v43 +; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 ; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 ; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 @@ -14520,7 +14537,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 @@ -14532,90 +14549,50 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v31 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_or_b32 s7, s8, s7 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v42 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB27_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -14636,9 +14613,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_4: -; VI-NEXT: v_mov_b32_e32 v28, v44 -; VI-NEXT: v_mov_b32_e32 v26, v4 -; VI-NEXT: v_mov_b32_e32 v33, v42 +; VI-NEXT: v_mov_b32_e32 v43, v44 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB27_2 ; @@ -14664,33 +14639,34 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v36, v28 ; GFX9-NEXT: v_mov_b32_e32 v35, v26 ; GFX9-NEXT: v_mov_b32_e32 v34, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v14 -; GFX9-NEXT: v_mov_b32_e32 v48, v12 -; GFX9-NEXT: v_mov_b32_e32 v49, v10 -; GFX9-NEXT: v_mov_b32_e32 v50, v8 -; GFX9-NEXT: v_mov_b32_e32 v51, v6 -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v45, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v39, v12 +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_mov_b32_e32 v49, v8 +; GFX9-NEXT: v_mov_b32_e32 v50, v6 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v44, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_mov_b32_e32 v37, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 @@ -14704,7 +14680,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v25 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v27 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(19) @@ -14712,256 +14688,248 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s8, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v43, v44 ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB27_3 +; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s19, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s23, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v28, v44 -; GFX9-NEXT: v_mov_b32_e32 v33, v42 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_cbranch_execnz .LBB27_3 -; GFX9-NEXT: .LBB27_2: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v1, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v26 -; GFX9-NEXT: s_movk_i32 s4, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_add_i32 s28, s28, 3 -; GFX9-NEXT: s_and_b32 s5, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s29, 8 -; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 -; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v51 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v50 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 ; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v35 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 ; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v40 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v31 ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_and_b32 s5, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s17, 8 -; GFX9-NEXT: s_add_i32 s18, s18, 3 -; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: s_and_b32 s6, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s19, 8 -; GFX9-NEXT: s_or_b32 s6, s7, s6 -; GFX9-NEXT: s_addk_i32 s5, 0x300 -; GFX9-NEXT: s_addk_i32 s6, 0x300 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_add_i32 s20, s20, 3 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s21, 8 -; GFX9-NEXT: s_add_i32 s22, s22, 3 -; GFX9-NEXT: s_or_b32 s6, s7, s6 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s23, 8 -; GFX9-NEXT: s_or_b32 s7, s8, s7 -; GFX9-NEXT: s_addk_i32 s6, 0x300 -; GFX9-NEXT: s_addk_i32 s7, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_add_i32 s24, s24, 3 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s25, 8 -; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: s_or_b32 s7, s8, s7 -; GFX9-NEXT: s_and_b32 s8, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s27, 8 -; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_addk_i32 s7, 0x300 -; GFX9-NEXT: s_addk_i32 s8, 0x300 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v52 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB27_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -14982,9 +14950,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB27_4: -; GFX9-NEXT: v_mov_b32_e32 v28, v44 -; GFX9-NEXT: v_mov_b32_e32 v26, v4 -; GFX9-NEXT: v_mov_b32_e32 v33, v42 +; GFX9-NEXT: v_mov_b32_e32 v43, v44 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB27_2 ; @@ -17238,88 +17204,88 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_mov_b32_e32 v33, v1 ; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_mov_b32_e32 v34, s16 -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v36, s18 -; SI-NEXT: v_mov_b32_e32 v37, s19 -; SI-NEXT: v_mov_b32_e32 v38, s20 -; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v54, s16 +; SI-NEXT: v_mov_b32_e32 v55, s17 +; SI-NEXT: v_mov_b32_e32 v52, s18 +; SI-NEXT: v_mov_b32_e32 v53, s19 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v51, s21 ; SI-NEXT: v_mov_b32_e32 v48, s22 ; SI-NEXT: v_mov_b32_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v50, s24 -; SI-NEXT: v_mov_b32_e32 v51, s25 -; SI-NEXT: v_mov_b32_e32 v52, s26 -; SI-NEXT: v_mov_b32_e32 v53, s27 -; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: v_mov_b32_e32 v38, s24 +; SI-NEXT: v_mov_b32_e32 v39, s25 +; SI-NEXT: v_mov_b32_e32 v36, s26 +; SI-NEXT: v_mov_b32_e32 v37, s27 +; SI-NEXT: v_mov_b32_e32 v34, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v55, s29 +; SI-NEXT: v_mov_b32_e32 v35, s29 ; SI-NEXT: s_cbranch_scc0 .LBB37_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[38:39], 16 ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 ; SI-NEXT: s_cbranch_execnz .LBB37_3 ; SI-NEXT: .LBB37_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v34, 1.0, v34 -; SI-NEXT: v_add_f32_e32 v37, 1.0, v37 -; SI-NEXT: v_add_f32_e32 v36, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v39, 1.0, v39 -; SI-NEXT: v_add_f32_e32 v38, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v49, 1.0, v49 -; SI-NEXT: v_add_f32_e32 v48, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v51, 1.0, v51 -; SI-NEXT: v_add_f32_e32 v50, 1.0, v50 -; SI-NEXT: v_add_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v52, 1.0, v52 ; SI-NEXT: v_add_f32_e32 v55, 1.0, v55 ; SI-NEXT: v_add_f32_e32 v54, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v51, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v50, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v49, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v48, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v38, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v36, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v34, 1.0, v34 ; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[38:39], 16 ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: .LBB37_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v34 -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v36 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_mov_b32_e32 v8, v38 -; SI-NEXT: v_mov_b32_e32 v10, v39 +; SI-NEXT: v_mov_b32_e32 v0, v54 +; SI-NEXT: v_mov_b32_e32 v2, v55 +; SI-NEXT: v_mov_b32_e32 v4, v52 +; SI-NEXT: v_mov_b32_e32 v6, v53 +; SI-NEXT: v_mov_b32_e32 v8, v50 +; SI-NEXT: v_mov_b32_e32 v10, v51 ; SI-NEXT: v_mov_b32_e32 v12, v48 ; SI-NEXT: v_mov_b32_e32 v14, v49 -; SI-NEXT: v_mov_b32_e32 v16, v50 -; SI-NEXT: v_mov_b32_e32 v18, v51 -; SI-NEXT: v_mov_b32_e32 v20, v52 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v26, v55 +; SI-NEXT: v_mov_b32_e32 v16, v38 +; SI-NEXT: v_mov_b32_e32 v18, v39 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: v_mov_b32_e32 v22, v37 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v26, v35 ; SI-NEXT: v_mov_b32_e32 v28, v32 ; SI-NEXT: v_mov_b32_e32 v30, v33 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -17336,10 +17302,10 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB37_2 ; ; VI-LABEL: bitcast_v16f32_to_v32i16_scalar: @@ -17845,40 +17811,40 @@ define inreg <16 x float> @bitcast_v32i16_to_v16f32_scalar(<32 x i16> inreg %a, ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: v_or_b32_e32 v7, v0, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v8, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v9, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v10, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v11, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v12, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v13, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v32 ; SI-NEXT: v_or_b32_e32 v15, v0, v17 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -18968,41 +18934,41 @@ define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 @@ -20331,18 +20297,18 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -21622,53 +21588,53 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s16 ; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 -; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 -; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 -; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshr_b64 v[0:1], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v63 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v62 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v61 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v59 ; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 ; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 @@ -21686,46 +21652,46 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 ; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v15, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 ; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 ; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -21818,8 +21784,8 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-LABEL: bitcast_v32bf16_to_v16f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v10, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -21832,11 +21798,11 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB47_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB47_3 @@ -23609,9 +23575,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -23628,108 +23592,107 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; kill: killed $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB48_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16 ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; VI-NEXT: v_mov_b32_e32 v26, v22 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; VI-NEXT: .LBB48_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB48_4 @@ -23745,176 +23708,173 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] ; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; VI-NEXT: .LBB48_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v27, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v28 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v26, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; VI-NEXT: v_or_b32_sdwa v2, v2, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -23940,9 +23900,7 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -23959,108 +23917,107 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB48_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v16 ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v26, v23 -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB48_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB48_4 @@ -24079,158 +24036,155 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 ; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 ; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB48_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v27, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v26, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v42 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v26 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -24918,12 +24872,12 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[22:23], s[92:93], 24 ; SI-NEXT: s_lshr_b64 s[24:25], s[92:93], 16 ; SI-NEXT: s_lshr_b64 s[26:27], s[92:93], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[94:95], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[94:95], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[94:95], 8 -; SI-NEXT: s_lshr_b64 s[28:29], s[30:31], 24 -; SI-NEXT: s_lshr_b64 s[40:41], s[30:31], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[30:31], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[94:95], 24 +; SI-NEXT: s_lshr_b64 s[40:41], s[94:95], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[94:95], 8 +; SI-NEXT: s_lshr_b64 s[44:45], s[30:31], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[30:31], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[30:31], 8 ; SI-NEXT: s_lshr_b64 s[58:59], s[34:35], 24 ; SI-NEXT: s_lshr_b64 s[60:61], s[34:35], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[34:35], 8 @@ -24952,10 +24906,10 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s77, v20 ; SI-NEXT: v_readfirstlane_b32 s62, v18 ; SI-NEXT: v_readfirstlane_b32 s63, v16 -; SI-NEXT: v_readfirstlane_b32 s44, v15 -; SI-NEXT: v_readfirstlane_b32 s45, v11 -; SI-NEXT: v_readfirstlane_b32 s28, v10 -; SI-NEXT: v_readfirstlane_b32 s29, v9 +; SI-NEXT: v_readfirstlane_b32 s56, v15 +; SI-NEXT: v_readfirstlane_b32 s57, v11 +; SI-NEXT: v_readfirstlane_b32 s42, v10 +; SI-NEXT: v_readfirstlane_b32 s43, v9 ; SI-NEXT: v_readfirstlane_b32 s26, v8 ; SI-NEXT: v_readfirstlane_b32 s27, v7 ; SI-NEXT: v_readfirstlane_b32 s20, v6 @@ -24976,12 +24930,12 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[22:23], s[26:27], 24 ; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 ; SI-NEXT: s_lshr_b64 s[26:27], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[28:29], 8 -; SI-NEXT: s_lshr_b64 s[28:29], s[44:45], 24 -; SI-NEXT: s_lshr_b64 s[40:41], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[44:45], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[28:29], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[44:45], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[56:57], 8 ; SI-NEXT: s_lshr_b64 s[58:59], s[62:63], 24 ; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[62:63], 8 @@ -25026,42 +24980,42 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr83 -; SI-NEXT: ; implicit-def: $sgpr81 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr85 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: s_branch .LBB49_2 ; SI-NEXT: .LBB49_4: ; SI-NEXT: v_mov_b32_e32 v22, s36 @@ -25145,15 +25099,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v36 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_lshl_b32 s5, s56, 8 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v35 ; SI-NEXT: v_or_b32_e32 v15, s5, v15 -; SI-NEXT: s_and_b32 s5, s40, 0xff +; SI-NEXT: s_and_b32 s5, s46, 0xff ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v18, v20, v18 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s28, 24 +; SI-NEXT: s_lshl_b32 s7, s44, 24 ; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 @@ -25169,15 +25123,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v11, v11, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v33 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: s_lshl_b32 s5, s42, 8 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v32 ; SI-NEXT: v_or_b32_e32 v10, s5, v10 -; SI-NEXT: s_and_b32 s5, s46, 0xff +; SI-NEXT: s_and_b32 s5, s40, 0xff ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s42, 24 +; SI-NEXT: s_lshl_b32 s7, s28, 24 ; SI-NEXT: v_or_b32_e32 v11, v11, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 @@ -25335,7 +25289,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -25456,36 +25410,35 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB49_4 ; VI-NEXT: .LBB49_2: ; %cmp.true +; VI-NEXT: v_add_f32_e64 v4, s7, 1.0 +; VI-NEXT: v_add_f32_e64 v3, s6, 1.0 ; VI-NEXT: v_add_f32_e64 v6, s9, 1.0 ; VI-NEXT: v_add_f32_e64 v5, s8, 1.0 -; VI-NEXT: v_add_f32_e64 v2, s5, 1.0 -; VI-NEXT: v_add_f32_e64 v1, s4, 1.0 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[3:4] ; VI-NEXT: v_add_f32_e64 v8, s11, 1.0 ; VI-NEXT: v_add_f32_e64 v7, s10, 1.0 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[5:6] ; VI-NEXT: v_add_f32_e64 v10, s13, 1.0 ; VI-NEXT: v_add_f32_e64 v9, s12, 1.0 +; VI-NEXT: v_add_f32_e64 v2, s5, 1.0 +; VI-NEXT: v_add_f32_e64 v1, s4, 1.0 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; VI-NEXT: v_add_f32_e64 v12, s15, 1.0 ; VI-NEXT: v_add_f32_e64 v11, s14, 1.0 -; VI-NEXT: v_add_f32_e64 v4, s7, 1.0 -; VI-NEXT: v_add_f32_e64 v3, s6, 1.0 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[9:10] -; VI-NEXT: v_add_f32_e64 v16, s17, 1.0 -; VI-NEXT: v_add_f32_e64 v15, s16, 1.0 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] +; VI-NEXT: v_add_f32_e64 v14, s17, 1.0 +; VI-NEXT: v_add_f32_e64 v13, s16, 1.0 ; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] ; VI-NEXT: v_add_f32_e64 v18, s19, 1.0 ; VI-NEXT: v_add_f32_e64 v17, s18, 1.0 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 @@ -25514,16 +25467,16 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v13 ; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v18 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v18 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v17 ; VI-NEXT: s_branch .LBB49_5 ; VI-NEXT: .LBB49_3: ; VI-NEXT: ; implicit-def: $sgpr67 @@ -25576,14 +25529,15 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: s_branch .LBB49_2 ; VI-NEXT: .LBB49_4: -; VI-NEXT: v_mov_b32_e32 v20, s44 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v20, s42 +; VI-NEXT: v_mov_b32_e32 v19, s57 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v19, s44 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v17, s18 ; VI-NEXT: v_mov_b32_e32 v18, s19 -; VI-NEXT: v_mov_b32_e32 v15, s16 -; VI-NEXT: v_mov_b32_e32 v16, s17 +; VI-NEXT: v_mov_b32_e32 v13, s16 +; VI-NEXT: v_mov_b32_e32 v14, s17 ; VI-NEXT: v_mov_b32_e32 v11, s14 ; VI-NEXT: v_mov_b32_e32 v12, s15 ; VI-NEXT: v_mov_b32_e32 v9, s12 @@ -25596,13 +25550,13 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v19, s67 -; VI-NEXT: v_mov_b32_e32 v62, s65 -; VI-NEXT: v_mov_b32_e32 v13, s66 +; VI-NEXT: v_mov_b32_e32 v16, s67 +; VI-NEXT: v_mov_b32_e32 v15, s65 +; VI-NEXT: v_mov_b32_e32 v62, s66 ; VI-NEXT: v_mov_b32_e32 v60, s64 ; VI-NEXT: v_mov_b32_e32 v61, s55 -; VI-NEXT: v_mov_b32_e32 v58, s54 -; VI-NEXT: v_mov_b32_e32 v59, s52 +; VI-NEXT: v_mov_b32_e32 v59, s54 +; VI-NEXT: v_mov_b32_e32 v58, s52 ; VI-NEXT: v_mov_b32_e32 v57, s53 ; VI-NEXT: v_mov_b32_e32 v47, s51 ; VI-NEXT: v_mov_b32_e32 v56, s50 @@ -25634,44 +25588,42 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v29, s60 ; VI-NEXT: v_mov_b32_e32 v28, s58 ; VI-NEXT: v_mov_b32_e32 v27, s59 -; VI-NEXT: v_mov_b32_e32 v14, s57 ; VI-NEXT: v_mov_b32_e32 v26, s56 +; VI-NEXT: v_mov_b32_e32 v19, s42 +; VI-NEXT: v_mov_b32_e32 v20, s40 +; VI-NEXT: v_mov_b32_e32 v21, s28 ; VI-NEXT: v_mov_b32_e32 v22, s26 ; VI-NEXT: v_mov_b32_e32 v23, s24 ; VI-NEXT: v_mov_b32_e32 v24, s22 ; VI-NEXT: v_mov_b32_e32 v25, s20 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v20, s40 -; VI-NEXT: v_mov_b32_e32 v21, s28 ; VI-NEXT: .LBB49_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; VI-NEXT: v_or_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v25 -; VI-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v18, v62, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v61 -; VI-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v17, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v13, v17, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v24 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v58 -; VI-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v62 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v61 +; VI-NEXT: v_or_b32_sdwa v15, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v60, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v16, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v59 +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; VI-NEXT: v_or_b32_sdwa v15, v58, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v15, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v57 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v56 -; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v15, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v56 +; VI-NEXT: v_or_b32_sdwa v14, v47, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v14, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v46 ; VI-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v23 @@ -25730,8 +25682,22 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v34 ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -25753,34 +25719,19 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; VI-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v5, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 -; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v4, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; VI-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v3, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; VI-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -25798,7 +25749,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -25807,7 +25758,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v63, s30, 0 ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 @@ -25924,38 +25875,36 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB49_4 ; GFX9-NEXT: .LBB49_2: ; %cmp.true -; GFX9-NEXT: v_add_f32_e64 v6, s9, 1.0 -; GFX9-NEXT: v_add_f32_e64 v5, s8, 1.0 ; GFX9-NEXT: v_add_f32_e64 v2, s5, 1.0 ; GFX9-NEXT: v_add_f32_e64 v1, s4, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX9-NEXT: v_add_f32_e64 v4, s7, 1.0 +; GFX9-NEXT: v_add_f32_e64 v3, s6, 1.0 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_f32_e64 v6, s9, 1.0 +; GFX9-NEXT: v_add_f32_e64 v5, s8, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] ; GFX9-NEXT: v_add_f32_e64 v8, s11, 1.0 ; GFX9-NEXT: v_add_f32_e64 v7, s10, 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] ; GFX9-NEXT: v_add_f32_e64 v10, s13, 1.0 ; GFX9-NEXT: v_add_f32_e64 v9, s12, 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] ; GFX9-NEXT: v_add_f32_e64 v12, s15, 1.0 ; GFX9-NEXT: v_add_f32_e64 v11, s14, 1.0 -; GFX9-NEXT: v_add_f32_e64 v4, s7, 1.0 -; GFX9-NEXT: v_add_f32_e64 v3, s6, 1.0 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; GFX9-NEXT: v_add_f32_e64 v16, s17, 1.0 -; GFX9-NEXT: v_add_f32_e64 v15, s16, 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] -; GFX9-NEXT: v_add_f32_e64 v20, s19, 1.0 -; GFX9-NEXT: v_add_f32_e64 v19, s18, 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; GFX9-NEXT: v_add_f32_e64 v14, s17, 1.0 +; GFX9-NEXT: v_add_f32_e64 v13, s16, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[11:12] +; GFX9-NEXT: v_add_f32_e64 v16, s19, 1.0 +; GFX9-NEXT: v_add_f32_e64 v15, s18, 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v2 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 @@ -25984,16 +25933,16 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v15 ; GFX9-NEXT: s_branch .LBB49_5 ; GFX9-NEXT: .LBB49_3: ; GFX9-NEXT: ; implicit-def: $sgpr55 @@ -26046,15 +25995,16 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: s_branch .LBB49_2 ; GFX9-NEXT: .LBB49_4: -; GFX9-NEXT: v_mov_b32_e32 v21, s44 -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v18, s57 +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v18, s44 +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v19, s18 -; GFX9-NEXT: v_mov_b32_e32 v20, s19 -; GFX9-NEXT: v_mov_b32_e32 v15, s16 -; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v14, s17 ; GFX9-NEXT: v_mov_b32_e32 v11, s14 ; GFX9-NEXT: v_mov_b32_e32 v12, s15 ; GFX9-NEXT: v_mov_b32_e32 v9, s12 @@ -26067,13 +26017,13 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v17, s55 -; GFX9-NEXT: v_mov_b32_e32 v62, s53 -; GFX9-NEXT: v_mov_b32_e32 v13, s54 +; GFX9-NEXT: v_mov_b32_e32 v25, s55 +; GFX9-NEXT: v_mov_b32_e32 v17, s53 +; GFX9-NEXT: v_mov_b32_e32 v62, s54 ; GFX9-NEXT: v_mov_b32_e32 v60, s52 ; GFX9-NEXT: v_mov_b32_e32 v61, s51 -; GFX9-NEXT: v_mov_b32_e32 v58, s50 -; GFX9-NEXT: v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v59, s50 +; GFX9-NEXT: v_mov_b32_e32 v58, s48 ; GFX9-NEXT: v_mov_b32_e32 v57, s49 ; GFX9-NEXT: v_mov_b32_e32 v47, s39 ; GFX9-NEXT: v_mov_b32_e32 v56, s38 @@ -26105,45 +26055,42 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v29, s60 ; GFX9-NEXT: v_mov_b32_e32 v28, s58 ; GFX9-NEXT: v_mov_b32_e32 v27, s59 -; GFX9-NEXT: v_mov_b32_e32 v14, s57 -; GFX9-NEXT: v_mov_b32_e32 v18, s56 -; GFX9-NEXT: v_mov_b32_e32 v23, s26 -; GFX9-NEXT: v_mov_b32_e32 v24, s24 -; GFX9-NEXT: v_mov_b32_e32 v25, s22 -; GFX9-NEXT: v_mov_b32_e32 v26, s20 -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s40 -; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v26, s56 +; GFX9-NEXT: v_mov_b32_e32 v18, s42 +; GFX9-NEXT: v_mov_b32_e32 v19, s40 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s22 +; GFX9-NEXT: v_mov_b32_e32 v24, s20 ; GFX9-NEXT: .LBB49_5: ; %end -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v26 -; GFX9-NEXT: v_or_b32_sdwa v19, v62, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v16, v60, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v15, v58, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v57 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 -; GFX9-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v14, v47, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v46 ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 @@ -26155,7 +26102,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 @@ -26167,7 +26114,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 @@ -26179,7 +26126,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 @@ -26191,8 +26138,20 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -26210,29 +26169,16 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v32 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v31 -; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload @@ -26251,7 +26197,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -26686,19 +26632,19 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 @@ -26713,24 +26659,24 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v17 ; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 ; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v17 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v2 ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v4 ; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v6 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -26748,9 +26694,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v53 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v55 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v40 ; SI-NEXT: s_waitcnt vmcnt(3) @@ -26766,47 +26712,43 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v47 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 ; SI-NEXT: v_or_b32_e32 v0, v0, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v62, v9 @@ -26821,16 +26763,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v12, v54, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -26840,67 +26779,79 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v18 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -26947,10 +26898,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v40 -; SI-NEXT: v_or_b32_e32 v13, v13, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v21 ; SI-NEXT: v_or_b32_e32 v14, v14, v25 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 @@ -26961,24 +26912,19 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v23 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 @@ -27005,19 +26951,20 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: .LBB50_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -27031,17 +26978,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -27061,75 +27004,85 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_mov_b32 s7, 0x3000000 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_or_b32_e32 v4, v18, v4 @@ -27195,11 +27148,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v19, v13 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v21 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v14, v25, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 @@ -27217,15 +27170,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v23, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 ; SI-NEXT: .LBB50_4: ; %end @@ -27293,7 +27240,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 @@ -27303,7 +27250,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 ; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 @@ -27333,7 +27280,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v26 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v30 ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v39 ; VI-NEXT: s_waitcnt vmcnt(7) @@ -27356,7 +27303,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 @@ -27372,20 +27319,18 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v30, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr53 @@ -27396,6 +27341,8 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -27405,43 +27352,43 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -27537,10 +27484,8 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27550,7 +27495,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v12, 3, v38 +; VI-NEXT: v_add_u16_e32 v12, 3, v30 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v13, 3, v58 @@ -27561,6 +27506,8 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -27570,56 +27517,56 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_add_u16_e32 v1, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v53 ; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v8, 3, v8 ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_sdwa v15, v17, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v15, v17, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -27747,7 +27694,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 @@ -27757,7 +27704,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 ; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 @@ -27791,7 +27738,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v26 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v30 ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v39 ; GFX9-NEXT: s_waitcnt vmcnt(7) @@ -27814,7 +27761,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 @@ -27830,20 +27777,18 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v30, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr53 @@ -27854,6 +27799,8 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -27863,43 +27810,43 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -27995,10 +27942,8 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -28008,7 +27953,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v13, 3, v58 @@ -28020,6 +27965,8 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v15, 3, v53 ; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -28029,53 +27976,53 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v16, 3, v17 ; GFX9-NEXT: v_or_b32_sdwa v16, v60, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 ; GFX9-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -29024,24 +28971,25 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v28 -; SI-NEXT: v_mov_b32_e32 v38, v26 -; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v50, v24 ; SI-NEXT: v_mov_b32_e32 v51, v14 ; SI-NEXT: v_mov_b32_e32 v54, v12 ; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v44, v6 -; SI-NEXT: v_mov_b32_e32 v33, v4 -; SI-NEXT: v_mov_b32_e32 v32, v2 -; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v4 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 @@ -29053,8 +29001,9 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v5 @@ -29067,43 +29016,48 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v10 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v14 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v26 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v61, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 ; SI-NEXT: v_or_b32_e32 v0, v0, v60 @@ -29124,135 +29078,163 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 -; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: v_mov_b32_e32 v26, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v0, v0, v29 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 -; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: s_or_b32 s8, s4, s5 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: v_or_b32_e32 v2, v2, v62 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_mov_b32_e32 v24, v43 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_cbranch_execnz .LBB51_3 +; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v52, v42 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_cbranch_execnz .LBB51_3 -; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v63, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v31 ; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -29293,85 +29275,30 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 ; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v48 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s20, 0xff -; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s24, 0xff -; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s26, 0xff -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_add_i32 s4, s4, 0x3000000 -; SI-NEXT: s_add_i32 s5, s5, 0x3000000 -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 ; SI-NEXT: v_or_b32_e32 v0, v21, v0 @@ -29381,44 +29308,59 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v26, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -29443,9 +29385,8 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v43 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB51_2 ; @@ -29471,33 +29412,34 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v36, v28 ; VI-NEXT: v_mov_b32_e32 v35, v26 ; VI-NEXT: v_mov_b32_e32 v34, v24 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: v_mov_b32_e32 v48, v12 -; VI-NEXT: v_mov_b32_e32 v49, v10 -; VI-NEXT: v_mov_b32_e32 v50, v8 -; VI-NEXT: v_mov_b32_e32 v51, v6 -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v45, v0 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v39, v12 +; VI-NEXT: v_mov_b32_e32 v48, v10 +; VI-NEXT: v_mov_b32_e32 v49, v8 +; VI-NEXT: v_mov_b32_e32 v50, v6 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v44, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:68 ; VI-NEXT: v_mov_b32_e32 v37, v30 ; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 @@ -29511,147 +29453,179 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v4 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: v_or_b32_sdwa v0, v53, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v50, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v33, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s8, s4, s5 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v52, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v42, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v44 ; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_cbranch_execnz .LBB51_3 +; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff ; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v28, v44 -; VI-NEXT: v_mov_b32_e32 v33, v42 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_cbranch_execnz .LBB51_3 -; VI-NEXT: .LBB51_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v28 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v45 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v43 +; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 ; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 ; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 @@ -29669,7 +29643,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 @@ -29681,90 +29655,50 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v31 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_or_b32 s7, s8, s7 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v42 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -29785,9 +29719,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: -; VI-NEXT: v_mov_b32_e32 v28, v44 -; VI-NEXT: v_mov_b32_e32 v26, v4 -; VI-NEXT: v_mov_b32_e32 v33, v42 +; VI-NEXT: v_mov_b32_e32 v43, v44 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB51_2 ; @@ -29813,33 +29745,34 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v36, v28 ; GFX9-NEXT: v_mov_b32_e32 v35, v26 ; GFX9-NEXT: v_mov_b32_e32 v34, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v14 -; GFX9-NEXT: v_mov_b32_e32 v48, v12 -; GFX9-NEXT: v_mov_b32_e32 v49, v10 -; GFX9-NEXT: v_mov_b32_e32 v50, v8 -; GFX9-NEXT: v_mov_b32_e32 v51, v6 -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v45, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v39, v12 +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_mov_b32_e32 v49, v8 +; GFX9-NEXT: v_mov_b32_e32 v50, v6 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v44, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_mov_b32_e32 v37, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 @@ -29853,7 +29786,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v25 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v27 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(19) @@ -29861,256 +29794,248 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s8, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v43, v44 ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB51_3 +; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s19, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s23, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v28, v44 -; GFX9-NEXT: v_mov_b32_e32 v33, v42 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_cbranch_execnz .LBB51_3 -; GFX9-NEXT: .LBB51_2: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v1, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v26 -; GFX9-NEXT: s_movk_i32 s4, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_add_i32 s28, s28, 3 -; GFX9-NEXT: s_and_b32 s5, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s29, 8 -; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 -; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v51 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v50 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 ; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v35 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 ; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v40 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v31 ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_and_b32 s5, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s17, 8 -; GFX9-NEXT: s_add_i32 s18, s18, 3 -; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: s_and_b32 s6, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s19, 8 -; GFX9-NEXT: s_or_b32 s6, s7, s6 -; GFX9-NEXT: s_addk_i32 s5, 0x300 -; GFX9-NEXT: s_addk_i32 s6, 0x300 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_add_i32 s20, s20, 3 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s21, 8 -; GFX9-NEXT: s_add_i32 s22, s22, 3 -; GFX9-NEXT: s_or_b32 s6, s7, s6 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s23, 8 -; GFX9-NEXT: s_or_b32 s7, s8, s7 -; GFX9-NEXT: s_addk_i32 s6, 0x300 -; GFX9-NEXT: s_addk_i32 s7, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_add_i32 s24, s24, 3 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s25, 8 -; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: s_or_b32 s7, s8, s7 -; GFX9-NEXT: s_and_b32 s8, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s27, 8 -; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_addk_i32 s7, 0x300 -; GFX9-NEXT: s_addk_i32 s8, 0x300 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v52 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB51_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -30131,9 +30056,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB51_4: -; GFX9-NEXT: v_mov_b32_e32 v28, v44 -; GFX9-NEXT: v_mov_b32_e32 v26, v4 -; GFX9-NEXT: v_mov_b32_e32 v33, v42 +; GFX9-NEXT: v_mov_b32_e32 v43, v44 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB51_2 ; @@ -31742,88 +31665,88 @@ define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_mov_b32_e32 v33, v1 ; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_mov_b32_e32 v34, s16 -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v36, s18 -; SI-NEXT: v_mov_b32_e32 v37, s19 -; SI-NEXT: v_mov_b32_e32 v38, s20 -; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v54, s16 +; SI-NEXT: v_mov_b32_e32 v55, s17 +; SI-NEXT: v_mov_b32_e32 v52, s18 +; SI-NEXT: v_mov_b32_e32 v53, s19 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v51, s21 ; SI-NEXT: v_mov_b32_e32 v48, s22 ; SI-NEXT: v_mov_b32_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v50, s24 -; SI-NEXT: v_mov_b32_e32 v51, s25 -; SI-NEXT: v_mov_b32_e32 v52, s26 -; SI-NEXT: v_mov_b32_e32 v53, s27 -; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: v_mov_b32_e32 v38, s24 +; SI-NEXT: v_mov_b32_e32 v39, s25 +; SI-NEXT: v_mov_b32_e32 v36, s26 +; SI-NEXT: v_mov_b32_e32 v37, s27 +; SI-NEXT: v_mov_b32_e32 v34, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v55, s29 +; SI-NEXT: v_mov_b32_e32 v35, s29 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[38:39], 16 ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v33, vcc, 0, v33, vcc -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 -; SI-NEXT: v_addc_u32_e32 v55, vcc, 0, v55, vcc -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 -; SI-NEXT: v_addc_u32_e32 v53, vcc, 0, v53, vcc -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 -; SI-NEXT: v_addc_u32_e32 v51, vcc, 0, v51, vcc -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: v_addc_u32_e32 v49, vcc, 0, v49, vcc -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 -; SI-NEXT: v_addc_u32_e32 v39, vcc, 0, v39, vcc -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: v_addc_u32_e32 v37, vcc, 0, v37, vcc ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v35, vcc +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_addc_u32_e32 v37, vcc, 0, v37, vcc +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_addc_u32_e32 v39, vcc, 0, v39, vcc +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: v_addc_u32_e32 v49, vcc, 0, v49, vcc +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v50 +; SI-NEXT: v_addc_u32_e32 v51, vcc, 0, v51, vcc +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v52 +; SI-NEXT: v_addc_u32_e32 v53, vcc, 0, v53, vcc +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 +; SI-NEXT: v_addc_u32_e32 v55, vcc, 0, v55, vcc ; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[38:39], 16 ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v34 -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v36 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_mov_b32_e32 v8, v38 -; SI-NEXT: v_mov_b32_e32 v10, v39 +; SI-NEXT: v_mov_b32_e32 v0, v54 +; SI-NEXT: v_mov_b32_e32 v2, v55 +; SI-NEXT: v_mov_b32_e32 v4, v52 +; SI-NEXT: v_mov_b32_e32 v6, v53 +; SI-NEXT: v_mov_b32_e32 v8, v50 +; SI-NEXT: v_mov_b32_e32 v10, v51 ; SI-NEXT: v_mov_b32_e32 v12, v48 ; SI-NEXT: v_mov_b32_e32 v14, v49 -; SI-NEXT: v_mov_b32_e32 v16, v50 -; SI-NEXT: v_mov_b32_e32 v18, v51 -; SI-NEXT: v_mov_b32_e32 v20, v52 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v26, v55 +; SI-NEXT: v_mov_b32_e32 v16, v38 +; SI-NEXT: v_mov_b32_e32 v18, v39 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: v_mov_b32_e32 v22, v37 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v26, v35 ; SI-NEXT: v_mov_b32_e32 v28, v32 ; SI-NEXT: v_mov_b32_e32 v30, v33 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -31840,10 +31763,10 @@ define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v8i64_to_v32i16_scalar: @@ -32345,40 +32268,40 @@ define inreg <8 x i64> @bitcast_v32i16_to_v8i64_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: v_or_b32_e32 v7, v0, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v8, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v9, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v10, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v11, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v12, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v13, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v32 ; SI-NEXT: v_or_b32_e32 v15, v0, v17 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -33490,41 +33413,41 @@ define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 @@ -34907,18 +34830,18 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB66_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -36198,53 +36121,53 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s16 ; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 -; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 -; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 -; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshr_b64 v[0:1], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v63 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v62 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v61 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v59 ; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 ; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 @@ -36262,46 +36185,46 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 ; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v15, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 ; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 ; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -36394,8 +36317,8 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-LABEL: bitcast_v32bf16_to_v8i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v10, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -36408,11 +36331,11 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB67_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB67_3 @@ -38185,9 +38108,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -38204,108 +38125,107 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; kill: killed $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB68_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16 ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; VI-NEXT: v_mov_b32_e32 v26, v22 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; VI-NEXT: .LBB68_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB68_4 @@ -38330,167 +38250,164 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; VI-NEXT: .LBB68_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v27, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v28 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v26, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; VI-NEXT: v_or_b32_sdwa v2, v2, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -38516,9 +38433,7 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -38535,108 +38450,107 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB68_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v16 ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v26, v23 -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB68_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB68_4 @@ -38662,151 +38576,148 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB68_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v27, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v26, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v42 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v26 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -39520,12 +39431,12 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 ; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[14:15], 8 ; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 ; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 @@ -39586,12 +39497,12 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 ; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[14:15], 8 ; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 ; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 @@ -39599,14 +39510,14 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[36:37], s[18:19], 8 ; SI-NEXT: .LBB69_3: ; %end -; SI-NEXT: s_lshl_b32 s21, s36, 8 ; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s21, s36, 8 ; SI-NEXT: s_or_b32 s18, s18, s21 ; SI-NEXT: s_and_b32 s21, s34, 0xff -; SI-NEXT: s_lshl_b32 s23, s30, 24 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_or_b32 s21, s23, s21 +; SI-NEXT: s_lshl_b32 s23, s30, 24 ; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s21, s23, s21 ; SI-NEXT: s_or_b32 s18, s18, s21 ; SI-NEXT: v_mov_b32_e32 v1, s18 ; SI-NEXT: s_and_b32 s18, s19, 0xff @@ -39615,45 +39526,46 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: s_and_b32 s19, s84, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s21, s83, 24 -; SI-NEXT: s_or_b32 s19, s21, s19 ; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s19, s21, s19 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: v_mov_b32_e32 v2, s18 -; SI-NEXT: s_lshl_b32 s18, s94, 8 ; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s18, s94, 8 ; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: s_and_b32 s18, s92, 0xff -; SI-NEXT: s_lshl_b32 s19, s90, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_lshl_b32 s19, s90, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: s_or_b32 s16, s16, s18 -; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s17, 0xff ; SI-NEXT: s_lshl_b32 s17, s82, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s81, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s80, 24 -; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s16, s76, 8 +; SI-NEXT: s_lshl_b32 s16, s88, 8 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: s_and_b32 s16, s72, 0xff +; SI-NEXT: s_and_b32 s16, s78, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s62, 24 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_lshl_b32 s17, s76, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -39673,11 +39585,11 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s14, s88, 8 +; SI-NEXT: s_lshl_b32 s14, s74, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 -; SI-NEXT: s_and_b32 s14, s78, 0xff +; SI-NEXT: s_and_b32 s14, s72, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s15, s74, 24 +; SI-NEXT: s_lshl_b32 s15, s62, 24 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 @@ -39851,42 +39763,42 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr81 ; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr67 ; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: s_branch .LBB69_2 ; ; VI-LABEL: bitcast_v8i64_to_v64i8_scalar: @@ -41151,19 +41063,19 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 @@ -41178,24 +41090,24 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v17 ; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 ; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v17 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v2 ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v4 ; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v6 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -41213,9 +41125,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v53 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v55 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v40 ; SI-NEXT: s_waitcnt vmcnt(3) @@ -41231,47 +41143,43 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v47 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 ; SI-NEXT: v_or_b32_e32 v0, v0, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v62, v9 @@ -41286,16 +41194,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v12, v54, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -41305,67 +41210,79 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v18 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -41412,10 +41329,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v40 -; SI-NEXT: v_or_b32_e32 v13, v13, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v21 ; SI-NEXT: v_or_b32_e32 v14, v14, v25 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 @@ -41426,24 +41343,19 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v23 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 @@ -41470,19 +41382,20 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: .LBB70_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB70_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -41496,17 +41409,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -41526,75 +41435,85 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_mov_b32 s7, 0x3000000 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_or_b32_e32 v4, v18, v4 @@ -41660,11 +41579,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v19, v13 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v21 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v14, v25, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 @@ -41682,15 +41601,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v23, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 ; SI-NEXT: .LBB70_4: ; %end @@ -41758,7 +41671,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 @@ -41768,7 +41681,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 ; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 @@ -41798,7 +41711,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v26 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v30 ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v39 ; VI-NEXT: s_waitcnt vmcnt(7) @@ -41821,7 +41734,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 @@ -41837,20 +41750,18 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v30, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr53 @@ -41861,6 +41772,8 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -41870,43 +41783,43 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -42002,10 +41915,8 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -42015,7 +41926,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v12, 3, v38 +; VI-NEXT: v_add_u16_e32 v12, 3, v30 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v13, 3, v58 @@ -42026,6 +41937,8 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -42035,56 +41948,56 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_add_u16_e32 v1, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v53 ; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v8, 3, v8 ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_sdwa v15, v17, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v15, v17, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -42212,7 +42125,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 @@ -42222,7 +42135,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 ; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 @@ -42256,7 +42169,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v26 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v30 ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v39 ; GFX9-NEXT: s_waitcnt vmcnt(7) @@ -42279,7 +42192,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 @@ -42295,20 +42208,18 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v30, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr53 @@ -42319,6 +42230,8 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -42328,43 +42241,43 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -42460,10 +42373,8 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -42473,7 +42384,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v13, 3, v58 @@ -42485,6 +42396,8 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v15, 3, v53 ; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -42494,53 +42407,53 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v16, 3, v17 ; GFX9-NEXT: v_or_b32_sdwa v16, v60, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 ; GFX9-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -43489,24 +43402,25 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v28 -; SI-NEXT: v_mov_b32_e32 v38, v26 -; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v50, v24 ; SI-NEXT: v_mov_b32_e32 v51, v14 ; SI-NEXT: v_mov_b32_e32 v54, v12 ; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v44, v6 -; SI-NEXT: v_mov_b32_e32 v33, v4 -; SI-NEXT: v_mov_b32_e32 v32, v2 -; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v4 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 @@ -43518,8 +43432,9 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v5 @@ -43532,43 +43447,48 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v10 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v14 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v26 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB71_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v61, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 ; SI-NEXT: v_or_b32_e32 v0, v0, v60 @@ -43589,135 +43509,163 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 -; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: v_mov_b32_e32 v26, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v0, v0, v29 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 -; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: s_or_b32 s8, s4, s5 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: v_or_b32_e32 v2, v2, v62 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_mov_b32_e32 v24, v43 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_cbranch_execnz .LBB71_3 +; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v52, v42 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_cbranch_execnz .LBB71_3 -; SI-NEXT: .LBB71_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v63, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v31 ; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -43758,85 +43706,30 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 ; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v48 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s20, 0xff -; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s24, 0xff -; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s26, 0xff -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_add_i32 s4, s4, 0x3000000 -; SI-NEXT: s_add_i32 s5, s5, 0x3000000 -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 ; SI-NEXT: v_or_b32_e32 v0, v21, v0 @@ -43846,44 +43739,59 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v26, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -43908,9 +43816,8 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB71_4: -; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v43 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB71_2 ; @@ -43936,33 +43843,34 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: v_mov_b32_e32 v36, v28 ; VI-NEXT: v_mov_b32_e32 v35, v26 ; VI-NEXT: v_mov_b32_e32 v34, v24 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: v_mov_b32_e32 v48, v12 -; VI-NEXT: v_mov_b32_e32 v49, v10 -; VI-NEXT: v_mov_b32_e32 v50, v8 -; VI-NEXT: v_mov_b32_e32 v51, v6 -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v45, v0 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v39, v12 +; VI-NEXT: v_mov_b32_e32 v48, v10 +; VI-NEXT: v_mov_b32_e32 v49, v8 +; VI-NEXT: v_mov_b32_e32 v50, v6 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v44, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:68 ; VI-NEXT: v_mov_b32_e32 v37, v30 ; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 @@ -43976,147 +43884,179 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v4 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB71_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: v_or_b32_sdwa v0, v53, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v50, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v33, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s8, s4, s5 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v52, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v42, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v44 ; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_cbranch_execnz .LBB71_3 +; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff ; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v28, v44 -; VI-NEXT: v_mov_b32_e32 v33, v42 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_cbranch_execnz .LBB71_3 -; VI-NEXT: .LBB71_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v28 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v45 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v43 +; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 ; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 ; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 @@ -44134,7 +44074,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 @@ -44146,90 +44086,50 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v31 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_or_b32 s7, s8, s7 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v42 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB71_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -44250,9 +44150,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB71_4: -; VI-NEXT: v_mov_b32_e32 v28, v44 -; VI-NEXT: v_mov_b32_e32 v26, v4 -; VI-NEXT: v_mov_b32_e32 v33, v42 +; VI-NEXT: v_mov_b32_e32 v43, v44 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB71_2 ; @@ -44278,33 +44176,34 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: v_mov_b32_e32 v36, v28 ; GFX9-NEXT: v_mov_b32_e32 v35, v26 ; GFX9-NEXT: v_mov_b32_e32 v34, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v14 -; GFX9-NEXT: v_mov_b32_e32 v48, v12 -; GFX9-NEXT: v_mov_b32_e32 v49, v10 -; GFX9-NEXT: v_mov_b32_e32 v50, v8 -; GFX9-NEXT: v_mov_b32_e32 v51, v6 -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v45, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v39, v12 +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_mov_b32_e32 v49, v8 +; GFX9-NEXT: v_mov_b32_e32 v50, v6 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v44, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_mov_b32_e32 v37, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 @@ -44318,7 +44217,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v25 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v27 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(19) @@ -44326,256 +44225,248 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s8, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v43, v44 ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB71_3 +; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s19, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s23, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v28, v44 -; GFX9-NEXT: v_mov_b32_e32 v33, v42 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_cbranch_execnz .LBB71_3 -; GFX9-NEXT: .LBB71_2: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v1, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v26 -; GFX9-NEXT: s_movk_i32 s4, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_add_i32 s28, s28, 3 -; GFX9-NEXT: s_and_b32 s5, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s29, 8 -; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 -; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v51 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v50 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 ; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v35 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 ; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v40 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v31 ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_and_b32 s5, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s17, 8 -; GFX9-NEXT: s_add_i32 s18, s18, 3 -; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: s_and_b32 s6, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s19, 8 -; GFX9-NEXT: s_or_b32 s6, s7, s6 -; GFX9-NEXT: s_addk_i32 s5, 0x300 -; GFX9-NEXT: s_addk_i32 s6, 0x300 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_add_i32 s20, s20, 3 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s21, 8 -; GFX9-NEXT: s_add_i32 s22, s22, 3 -; GFX9-NEXT: s_or_b32 s6, s7, s6 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s23, 8 -; GFX9-NEXT: s_or_b32 s7, s8, s7 -; GFX9-NEXT: s_addk_i32 s6, 0x300 -; GFX9-NEXT: s_addk_i32 s7, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_add_i32 s24, s24, 3 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s25, 8 -; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: s_or_b32 s7, s8, s7 -; GFX9-NEXT: s_and_b32 s8, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s27, 8 -; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_addk_i32 s7, 0x300 -; GFX9-NEXT: s_addk_i32 s8, 0x300 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v52 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB71_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -44596,9 +44487,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB71_4: -; GFX9-NEXT: v_mov_b32_e32 v28, v44 -; GFX9-NEXT: v_mov_b32_e32 v26, v4 -; GFX9-NEXT: v_mov_b32_e32 v33, v42 +; GFX9-NEXT: v_mov_b32_e32 v43, v44 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB71_2 ; @@ -45392,22 +45281,24 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v8f64_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v15 -; SI-NEXT: v_mov_b32_e32 v54, v14 -; SI-NEXT: v_mov_b32_e32 v53, v13 -; SI-NEXT: v_mov_b32_e32 v52, v12 -; SI-NEXT: v_mov_b32_e32 v51, v11 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v49, v9 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_mov_b32_e32 v38, v7 -; SI-NEXT: v_mov_b32_e32 v37, v6 -; SI-NEXT: v_mov_b32_e32 v36, v5 -; SI-NEXT: v_mov_b32_e32 v35, v4 -; SI-NEXT: v_mov_b32_e32 v34, v3 -; SI-NEXT: v_mov_b32_e32 v33, v2 +; SI-NEXT: v_mov_b32_e32 v33, v15 +; SI-NEXT: v_mov_b32_e32 v32, v14 +; SI-NEXT: v_mov_b32_e32 v35, v13 +; SI-NEXT: v_mov_b32_e32 v34, v12 +; SI-NEXT: v_mov_b32_e32 v37, v11 +; SI-NEXT: v_mov_b32_e32 v36, v10 +; SI-NEXT: v_mov_b32_e32 v39, v9 +; SI-NEXT: v_mov_b32_e32 v38, v8 +; SI-NEXT: v_mov_b32_e32 v49, v7 +; SI-NEXT: v_mov_b32_e32 v48, v6 +; SI-NEXT: v_mov_b32_e32 v51, v5 +; SI-NEXT: v_mov_b32_e32 v50, v4 +; SI-NEXT: v_mov_b32_e32 v53, v3 +; SI-NEXT: v_mov_b32_e32 v52, v2 +; SI-NEXT: v_mov_b32_e32 v55, v1 +; SI-NEXT: v_mov_b32_e32 v54, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -45427,68 +45318,68 @@ define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v55, v54, 16 -; SI-NEXT: v_alignbit_b32 v25, v53, v52, 16 -; SI-NEXT: v_alignbit_b32 v21, v51, v50, 16 -; SI-NEXT: v_alignbit_b32 v17, v49, v48, 16 -; SI-NEXT: v_alignbit_b32 v13, v38, v37, 16 -; SI-NEXT: v_alignbit_b32 v9, v36, v35, 16 -; SI-NEXT: v_alignbit_b32 v5, v34, v33, 16 -; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_alignbit_b32 v29, v33, v32, 16 +; SI-NEXT: v_alignbit_b32 v25, v35, v34, 16 +; SI-NEXT: v_alignbit_b32 v21, v37, v36, 16 +; SI-NEXT: v_alignbit_b32 v17, v39, v38, 16 +; SI-NEXT: v_alignbit_b32 v13, v49, v48, 16 +; SI-NEXT: v_alignbit_b32 v9, v51, v50, 16 +; SI-NEXT: v_alignbit_b32 v5, v53, v52, 16 +; SI-NEXT: v_alignbit_b32 v1, v55, v54, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: .LBB72_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 -; SI-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 -; SI-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 -; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 -; SI-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 ; SI-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 ; SI-NEXT: v_add_f64 v[52:53], v[52:53], 1.0 -; SI-NEXT: v_alignbit_b32 v29, v55, v54, 16 -; SI-NEXT: v_alignbit_b32 v25, v53, v52, 16 -; SI-NEXT: v_alignbit_b32 v21, v51, v50, 16 -; SI-NEXT: v_alignbit_b32 v17, v49, v48, 16 -; SI-NEXT: v_alignbit_b32 v13, v38, v37, 16 -; SI-NEXT: v_alignbit_b32 v9, v36, v35, 16 -; SI-NEXT: v_alignbit_b32 v5, v34, v33, 16 -; SI-NEXT: v_alignbit_b32 v32, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 +; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 +; SI-NEXT: v_add_f64 v[38:39], v[38:39], 1.0 +; SI-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 +; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; SI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 +; SI-NEXT: v_alignbit_b32 v29, v33, v32, 16 +; SI-NEXT: v_alignbit_b32 v25, v35, v34, 16 +; SI-NEXT: v_alignbit_b32 v21, v37, v36, 16 +; SI-NEXT: v_alignbit_b32 v17, v39, v38, 16 +; SI-NEXT: v_alignbit_b32 v13, v49, v48, 16 +; SI-NEXT: v_alignbit_b32 v9, v51, v50, 16 +; SI-NEXT: v_alignbit_b32 v5, v53, v52, 16 +; SI-NEXT: v_alignbit_b32 v1, v55, v54, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: .LBB72_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v4, v33 -; SI-NEXT: v_mov_b32_e32 v6, v34 -; SI-NEXT: v_mov_b32_e32 v8, v35 -; SI-NEXT: v_mov_b32_e32 v10, v36 -; SI-NEXT: v_mov_b32_e32 v12, v37 -; SI-NEXT: v_mov_b32_e32 v14, v38 -; SI-NEXT: v_mov_b32_e32 v16, v48 -; SI-NEXT: v_mov_b32_e32 v18, v49 -; SI-NEXT: v_mov_b32_e32 v20, v50 -; SI-NEXT: v_mov_b32_e32 v22, v51 -; SI-NEXT: v_mov_b32_e32 v24, v52 -; SI-NEXT: v_mov_b32_e32 v26, v53 -; SI-NEXT: v_mov_b32_e32 v28, v54 -; SI-NEXT: v_mov_b32_e32 v30, v55 -; SI-NEXT: v_mov_b32_e32 v1, v32 +; SI-NEXT: v_mov_b32_e32 v0, v54 +; SI-NEXT: v_mov_b32_e32 v2, v55 +; SI-NEXT: v_mov_b32_e32 v4, v52 +; SI-NEXT: v_mov_b32_e32 v6, v53 +; SI-NEXT: v_mov_b32_e32 v8, v50 +; SI-NEXT: v_mov_b32_e32 v10, v51 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v14, v49 +; SI-NEXT: v_mov_b32_e32 v16, v38 +; SI-NEXT: v_mov_b32_e32 v18, v39 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: v_mov_b32_e32 v22, v37 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v26, v35 +; SI-NEXT: v_mov_b32_e32 v28, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f64_to_v32i16: @@ -45578,80 +45469,80 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_mov_b32_e32 v33, v1 ; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_mov_b32_e32 v34, s16 -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v36, s18 -; SI-NEXT: v_mov_b32_e32 v37, s19 -; SI-NEXT: v_mov_b32_e32 v38, s20 -; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v54, s16 +; SI-NEXT: v_mov_b32_e32 v55, s17 +; SI-NEXT: v_mov_b32_e32 v52, s18 +; SI-NEXT: v_mov_b32_e32 v53, s19 +; SI-NEXT: v_mov_b32_e32 v50, s20 +; SI-NEXT: v_mov_b32_e32 v51, s21 ; SI-NEXT: v_mov_b32_e32 v48, s22 ; SI-NEXT: v_mov_b32_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v50, s24 -; SI-NEXT: v_mov_b32_e32 v51, s25 -; SI-NEXT: v_mov_b32_e32 v52, s26 -; SI-NEXT: v_mov_b32_e32 v53, s27 -; SI-NEXT: v_mov_b32_e32 v54, s28 +; SI-NEXT: v_mov_b32_e32 v38, s24 +; SI-NEXT: v_mov_b32_e32 v39, s25 +; SI-NEXT: v_mov_b32_e32 v36, s26 +; SI-NEXT: v_mov_b32_e32 v37, s27 +; SI-NEXT: v_mov_b32_e32 v34, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v55, s29 +; SI-NEXT: v_mov_b32_e32 v35, s29 ; SI-NEXT: s_cbranch_scc0 .LBB73_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[38:39], 16 ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 ; SI-NEXT: s_cbranch_execnz .LBB73_3 ; SI-NEXT: .LBB73_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 -; SI-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 -; SI-NEXT: v_add_f64 v[52:53], v[52:53], 1.0 -; SI-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 -; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 -; SI-NEXT: v_add_f64 v[38:39], v[38:39], 1.0 -; SI-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 ; SI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 +; SI-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 +; SI-NEXT: v_add_f64 v[38:39], v[38:39], 1.0 +; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 +; SI-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 +; SI-NEXT: v_add_f64 v[52:53], v[52:53], 1.0 +; SI-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 ; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[38:39], 16 ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 -; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v39 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 ; SI-NEXT: .LBB73_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v34 -; SI-NEXT: v_mov_b32_e32 v2, v35 -; SI-NEXT: v_mov_b32_e32 v4, v36 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_mov_b32_e32 v8, v38 -; SI-NEXT: v_mov_b32_e32 v10, v39 +; SI-NEXT: v_mov_b32_e32 v0, v54 +; SI-NEXT: v_mov_b32_e32 v2, v55 +; SI-NEXT: v_mov_b32_e32 v4, v52 +; SI-NEXT: v_mov_b32_e32 v6, v53 +; SI-NEXT: v_mov_b32_e32 v8, v50 +; SI-NEXT: v_mov_b32_e32 v10, v51 ; SI-NEXT: v_mov_b32_e32 v12, v48 ; SI-NEXT: v_mov_b32_e32 v14, v49 -; SI-NEXT: v_mov_b32_e32 v16, v50 -; SI-NEXT: v_mov_b32_e32 v18, v51 -; SI-NEXT: v_mov_b32_e32 v20, v52 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v24, v54 -; SI-NEXT: v_mov_b32_e32 v26, v55 +; SI-NEXT: v_mov_b32_e32 v16, v38 +; SI-NEXT: v_mov_b32_e32 v18, v39 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: v_mov_b32_e32 v22, v37 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v26, v35 ; SI-NEXT: v_mov_b32_e32 v28, v32 ; SI-NEXT: v_mov_b32_e32 v30, v33 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -45668,10 +45559,10 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB73_2 ; ; VI-LABEL: bitcast_v8f64_to_v32i16_scalar: @@ -46153,40 +46044,40 @@ define inreg <8 x double> @bitcast_v32i16_to_v8f64_scalar(<32 x i16> inreg %a, i ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: v_or_b32_e32 v7, v0, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v23 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v9, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v8, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v10, v0, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v9, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v11, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v10, v0, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v12, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v11, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v19 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v13, v0, v27 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v12, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v25 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v13, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v32 ; SI-NEXT: v_or_b32_e32 v15, v0, v17 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -47208,41 +47099,41 @@ define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB78_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v53 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 @@ -48503,18 +48394,18 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB82_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -49794,53 +49685,53 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e64 v60, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v59, 1.0, s19 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mul_f32_e64 v58, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s16 ; SI-NEXT: v_mul_f32_e32 v56, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v63, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v62, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v61, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v58, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v28, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v63, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v59, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v36, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v57, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v34, 1.0, s28 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v59 -; SI-NEXT: v_lshr_b64 v[0:1], v[54:55], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[52:53], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v63 -; SI-NEXT: v_lshr_b64 v[2:3], v[50:51], 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v62 -; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 -; SI-NEXT: v_lshr_b64 v[4:5], v[38:39], 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v58 +; SI-NEXT: v_lshr_b64 v[0:1], v[38:39], 16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v63 +; SI-NEXT: v_lshr_b64 v[1:2], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v62 +; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v61 +; SI-NEXT: v_lshr_b64 v[3:4], v[50:51], 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v59 ; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v57 ; SI-NEXT: v_lshr_b64 v[6:7], v[34:35], 16 @@ -49858,46 +49749,46 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-NEXT: v_lshr_b64 v[12:13], v[22:23], 16 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v42 ; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 -; SI-NEXT: v_mov_b32_e32 v15, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 ; SI-NEXT: v_lshr_b64 v[39:40], v[16:17], 16 -; SI-NEXT: v_mov_b32_e32 v17, v15 +; SI-NEXT: v_mov_b32_e32 v17, v14 +; SI-NEXT: v_lshr_b64 v[14:15], v[18:19], 16 ; SI-NEXT: v_mov_b32_e32 v15, v39 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v59 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -49990,8 +49881,8 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-LABEL: bitcast_v32bf16_to_v8f64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v10, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 @@ -50004,11 +49895,11 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 -; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: v_mov_b32_e32 v13, s29 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB83_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB83_3 @@ -51380,8 +51271,8 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 @@ -51394,8 +51285,8 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 @@ -51453,8 +51344,8 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 ; SI-NEXT: v_alignbit_b32 v40, v4, v3, 16 ; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 -; SI-NEXT: v_alignbit_b32 v47, v2, v1, 24 -; SI-NEXT: v_alignbit_b32 v57, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 ; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 @@ -51473,8 +51364,8 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 ; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 ; SI-NEXT: s_waitcnt expcnt(4) @@ -51521,8 +51412,8 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v55, v4, v3, 24 ; SI-NEXT: v_alignbit_b32 v40, v4, v3, 16 ; SI-NEXT: v_alignbit_b32 v43, v4, v3, 8 -; SI-NEXT: v_alignbit_b32 v47, v2, v1, 24 -; SI-NEXT: v_alignbit_b32 v57, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v46, v2, v1, 24 +; SI-NEXT: v_alignbit_b32 v56, v2, v1, 16 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_alignbit_b32 v58, v2, v1, 8 ; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 @@ -51541,8 +51432,8 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v8 ; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v6 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v6 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v4 ; SI-NEXT: s_waitcnt expcnt(4) @@ -51557,16 +51448,16 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 ; SI-NEXT: .LBB84_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 ; SI-NEXT: v_or_b32_e32 v1, v1, v58 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v47 -; SI-NEXT: v_or_b32_e32 v47, v47, v57 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v46 +; SI-NEXT: v_or_b32_e32 v46, v46, v56 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 @@ -51618,9 +51509,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v57 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -51773,9 +51664,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -51792,108 +51681,107 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; kill: killed $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB84_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16 ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; VI-NEXT: v_mov_b32_e32 v26, v22 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; VI-NEXT: .LBB84_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB84_4 @@ -51902,175 +51790,172 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; VI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; VI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; VI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; VI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; VI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; VI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; VI-NEXT: .LBB84_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v27, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v28 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v26, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; VI-NEXT: v_or_b32_sdwa v2, v2, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -52096,9 +51981,7 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -52115,108 +51998,107 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB84_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v16 ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v26, v23 -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB84_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB84_4 @@ -52226,159 +52108,156 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; GFX9-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; GFX9-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; GFX9-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; GFX9-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB84_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v27, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v26, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v42 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v26 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -52994,8 +52873,6 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v40, s81, 25 ; SI-NEXT: v_writelane_b32 v40, s82, 26 ; SI-NEXT: v_writelane_b32 v40, s83, 27 -; SI-NEXT: v_writelane_b32 v40, s84, 28 -; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: v_mov_b32_e32 v4, s16 ; SI-NEXT: v_mov_b32_e32 v5, s17 ; SI-NEXT: v_mov_b32_e32 v6, s18 @@ -53011,7 +52888,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v16, s28 ; SI-NEXT: v_mov_b32_e32 v17, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_writelane_b32 v40, s84, 28 ; SI-NEXT: v_readfirstlane_b32 s18, v4 ; SI-NEXT: v_readfirstlane_b32 s19, v5 ; SI-NEXT: v_readfirstlane_b32 s16, v6 @@ -53029,222 +52906,222 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_readfirstlane_b32 s4, v1 ; SI-NEXT: s_and_b64 s[20:21], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v2 -; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: s_cbranch_scc0 .LBB85_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshr_b32 s48, s5, 24 -; SI-NEXT: s_lshr_b32 s49, s5, 16 -; SI-NEXT: s_lshr_b32 s50, s5, 8 -; SI-NEXT: s_lshr_b32 s51, s7, 24 -; SI-NEXT: s_lshr_b32 s52, s7, 16 -; SI-NEXT: s_lshr_b32 s53, s7, 8 -; SI-NEXT: s_lshr_b32 s54, s9, 24 -; SI-NEXT: s_lshr_b32 s55, s9, 16 -; SI-NEXT: s_lshr_b32 s64, s9, 8 -; SI-NEXT: s_lshr_b32 s65, s11, 24 -; SI-NEXT: s_lshr_b32 s66, s11, 16 -; SI-NEXT: s_lshr_b32 s67, s11, 8 -; SI-NEXT: s_lshr_b32 s68, s13, 24 -; SI-NEXT: s_lshr_b32 s69, s13, 16 -; SI-NEXT: s_lshr_b32 s70, s13, 8 -; SI-NEXT: s_lshr_b32 s71, s15, 24 -; SI-NEXT: s_lshr_b32 s80, s15, 16 -; SI-NEXT: s_lshr_b32 s81, s15, 8 -; SI-NEXT: s_lshr_b32 s82, s17, 24 -; SI-NEXT: s_lshr_b32 s83, s17, 16 -; SI-NEXT: s_lshr_b32 s84, s17, 8 -; SI-NEXT: s_lshr_b32 s85, s19, 24 -; SI-NEXT: s_lshr_b32 s86, s19, 16 -; SI-NEXT: s_lshr_b32 s87, s19, 8 +; SI-NEXT: s_lshr_b32 s38, s5, 24 +; SI-NEXT: s_lshr_b32 s39, s5, 16 +; SI-NEXT: s_lshr_b32 s48, s5, 8 +; SI-NEXT: s_lshr_b32 s49, s7, 24 +; SI-NEXT: s_lshr_b32 s50, s7, 16 +; SI-NEXT: s_lshr_b32 s51, s7, 8 +; SI-NEXT: s_lshr_b32 s52, s9, 24 +; SI-NEXT: s_lshr_b32 s53, s9, 16 +; SI-NEXT: s_lshr_b32 s54, s9, 8 +; SI-NEXT: s_lshr_b32 s55, s11, 24 +; SI-NEXT: s_lshr_b32 s64, s11, 16 +; SI-NEXT: s_lshr_b32 s65, s11, 8 +; SI-NEXT: s_lshr_b32 s66, s13, 24 +; SI-NEXT: s_lshr_b32 s67, s13, 16 +; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b32 s69, s15, 24 +; SI-NEXT: s_lshr_b32 s70, s15, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 8 +; SI-NEXT: s_lshr_b32 s80, s17, 24 +; SI-NEXT: s_lshr_b32 s81, s17, 16 +; SI-NEXT: s_lshr_b32 s82, s17, 8 +; SI-NEXT: s_lshr_b32 s83, s19, 24 +; SI-NEXT: s_lshr_b32 s84, s19, 16 +; SI-NEXT: s_lshr_b32 s85, s19, 8 ; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[4:5], 8 -; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[60:61], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[76:77], s[8:9], 24 -; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[10:11], 24 -; SI-NEXT: s_lshr_b64 s[94:95], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[36:37], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[38:39], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[22:23], s[14:15], 24 -; SI-NEXT: s_lshr_b64 s[26:27], s[14:15], 16 -; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 8 -; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 8 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 24 -; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[76:77], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[18:19], 8 ; SI-NEXT: s_cbranch_execnz .LBB85_4 ; SI-NEXT: .LBB85_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[28:29], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 ; SI-NEXT: v_add_f64 v[13:14], s[12:13], 1.0 -; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 -; SI-NEXT: v_lshr_b64 v[48:49], v[28:29], 24 +; SI-NEXT: v_add_f64 v[26:27], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[33:34], v[13:14], 24 +; SI-NEXT: v_lshr_b64 v[48:49], v[26:27], 24 ; SI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[20:21], s[14:15], 1.0 -; SI-NEXT: v_add_f64 v[32:33], s[18:19], 1.0 -; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16 +; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[19:20], s[14:15], 1.0 +; SI-NEXT: v_add_f64 v[31:32], s[18:19], 1.0 ; SI-NEXT: v_lshr_b64 v[34:35], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[28:29], 16 -; SI-NEXT: v_lshr_b64 v[15:16], v[3:4], 24 -; SI-NEXT: v_lshr_b64 v[23:24], v[5:6], 8 +; SI-NEXT: v_lshr_b64 v[49:50], v[26:27], 16 +; SI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 +; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 24 ; SI-NEXT: v_lshr_b64 v[35:36], v[13:14], 8 -; SI-NEXT: v_lshr_b64 v[50:51], v[28:29], 8 +; SI-NEXT: v_lshr_b64 v[50:51], v[26:27], 8 ; SI-NEXT: v_lshr_b64 v[9:10], v[1:2], 24 -; SI-NEXT: v_lshr_b64 v[16:17], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 24 -; SI-NEXT: v_lshr_b64 v[36:37], v[20:21], 24 -; SI-NEXT: v_lshr_b64 v[51:52], v[32:33], 24 +; SI-NEXT: v_lshr_b64 v[15:16], v[3:4], 24 +; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[19:20], 24 +; SI-NEXT: v_lshr_b64 v[51:52], v[31:32], 24 ; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[17:18], v[3:4], 8 -; SI-NEXT: v_lshr_b64 v[25:26], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[32:33], 16 -; SI-NEXT: v_readfirstlane_b32 s19, v33 -; SI-NEXT: v_readfirstlane_b32 s17, v29 -; SI-NEXT: v_readfirstlane_b32 s15, v21 +; SI-NEXT: v_lshr_b64 v[16:17], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[5:6], 8 +; SI-NEXT: v_lshr_b64 v[28:29], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[31:32], 16 +; SI-NEXT: v_readfirstlane_b32 s19, v32 +; SI-NEXT: v_readfirstlane_b32 s17, v27 +; SI-NEXT: v_readfirstlane_b32 s15, v20 ; SI-NEXT: v_readfirstlane_b32 s13, v14 ; SI-NEXT: v_readfirstlane_b32 s11, v8 ; SI-NEXT: v_readfirstlane_b32 s9, v6 ; SI-NEXT: v_readfirstlane_b32 s7, v4 ; SI-NEXT: v_readfirstlane_b32 s5, v2 ; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 8 -; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 24 -; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 8 -; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 24 -; SI-NEXT: v_lshr_b64 v[38:39], v[20:21], 8 -; SI-NEXT: v_lshr_b64 v[53:54], v[32:33], 8 -; SI-NEXT: s_lshr_b32 s48, s5, 24 -; SI-NEXT: s_lshr_b32 s49, s5, 16 -; SI-NEXT: s_lshr_b32 s50, s5, 8 -; SI-NEXT: s_lshr_b32 s51, s7, 24 -; SI-NEXT: s_lshr_b32 s52, s7, 16 -; SI-NEXT: s_lshr_b32 s53, s7, 8 -; SI-NEXT: s_lshr_b32 s54, s9, 24 -; SI-NEXT: s_lshr_b32 s55, s9, 16 -; SI-NEXT: s_lshr_b32 s64, s9, 8 -; SI-NEXT: s_lshr_b32 s65, s11, 24 -; SI-NEXT: s_lshr_b32 s66, s11, 16 -; SI-NEXT: s_lshr_b32 s67, s11, 8 -; SI-NEXT: s_lshr_b32 s68, s13, 24 -; SI-NEXT: s_lshr_b32 s69, s13, 16 -; SI-NEXT: s_lshr_b32 s70, s13, 8 -; SI-NEXT: s_lshr_b32 s71, s15, 24 -; SI-NEXT: s_lshr_b32 s80, s15, 16 -; SI-NEXT: s_lshr_b32 s81, s15, 8 -; SI-NEXT: s_lshr_b32 s82, s17, 24 -; SI-NEXT: s_lshr_b32 s83, s17, 16 -; SI-NEXT: s_lshr_b32 s84, s17, 8 -; SI-NEXT: s_lshr_b32 s85, s19, 24 -; SI-NEXT: s_lshr_b32 s86, s19, 16 -; SI-NEXT: s_lshr_b32 s87, s19, 8 +; SI-NEXT: v_lshr_b64 v[17:18], v[3:4], 8 +; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 24 +; SI-NEXT: v_lshr_b64 v[29:30], v[7:8], 8 +; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 8 +; SI-NEXT: v_lshr_b64 v[53:54], v[31:32], 8 +; SI-NEXT: s_lshr_b32 s38, s5, 24 +; SI-NEXT: s_lshr_b32 s39, s5, 16 +; SI-NEXT: s_lshr_b32 s48, s5, 8 +; SI-NEXT: s_lshr_b32 s49, s7, 24 +; SI-NEXT: s_lshr_b32 s50, s7, 16 +; SI-NEXT: s_lshr_b32 s51, s7, 8 +; SI-NEXT: s_lshr_b32 s52, s9, 24 +; SI-NEXT: s_lshr_b32 s53, s9, 16 +; SI-NEXT: s_lshr_b32 s54, s9, 8 +; SI-NEXT: s_lshr_b32 s55, s11, 24 +; SI-NEXT: s_lshr_b32 s64, s11, 16 +; SI-NEXT: s_lshr_b32 s65, s11, 8 +; SI-NEXT: s_lshr_b32 s66, s13, 24 +; SI-NEXT: s_lshr_b32 s67, s13, 16 +; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b32 s69, s15, 24 +; SI-NEXT: s_lshr_b32 s70, s15, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 8 +; SI-NEXT: s_lshr_b32 s80, s17, 24 +; SI-NEXT: s_lshr_b32 s81, s17, 16 +; SI-NEXT: s_lshr_b32 s82, s17, 8 +; SI-NEXT: s_lshr_b32 s83, s19, 24 +; SI-NEXT: s_lshr_b32 s84, s19, 16 +; SI-NEXT: s_lshr_b32 s85, s19, 8 ; SI-NEXT: s_branch .LBB85_5 ; SI-NEXT: .LBB85_3: -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr87 -; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr85 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr81 ; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr67 ; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: s_branch .LBB85_2 ; SI-NEXT: .LBB85_4: -; SI-NEXT: v_mov_b32_e32 v32, s18 -; SI-NEXT: v_mov_b32_e32 v28, s16 -; SI-NEXT: v_mov_b32_e32 v20, s14 +; SI-NEXT: v_mov_b32_e32 v31, s18 +; SI-NEXT: v_mov_b32_e32 v26, s16 +; SI-NEXT: v_mov_b32_e32 v19, s14 ; SI-NEXT: v_mov_b32_e32 v13, s12 ; SI-NEXT: v_mov_b32_e32 v7, s10 ; SI-NEXT: v_mov_b32_e32 v5, s8 ; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_mov_b32_e32 v53, s74 -; SI-NEXT: v_mov_b32_e32 v52, s62 -; SI-NEXT: v_mov_b32_e32 v51, s58 -; SI-NEXT: v_mov_b32_e32 v50, s56 -; SI-NEXT: v_mov_b32_e32 v49, s44 -; SI-NEXT: v_mov_b32_e32 v48, s42 -; SI-NEXT: v_mov_b32_e32 v38, s40 -; SI-NEXT: v_mov_b32_e32 v37, s26 -; SI-NEXT: v_mov_b32_e32 v36, s22 -; SI-NEXT: v_mov_b32_e32 v35, s38 -; SI-NEXT: v_mov_b32_e32 v34, s36 -; SI-NEXT: v_mov_b32_e32 v30, s34 -; SI-NEXT: v_mov_b32_e32 v26, s30 -; SI-NEXT: v_mov_b32_e32 v25, s94 -; SI-NEXT: v_mov_b32_e32 v24, s92 -; SI-NEXT: v_mov_b32_e32 v23, s88 -; SI-NEXT: v_mov_b32_e32 v22, s78 -; SI-NEXT: v_mov_b32_e32 v18, s76 -; SI-NEXT: v_mov_b32_e32 v17, s72 -; SI-NEXT: v_mov_b32_e32 v16, s60 -; SI-NEXT: v_mov_b32_e32 v15, s46 -; SI-NEXT: v_mov_b32_e32 v11, s28 -; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v53, s36 +; SI-NEXT: v_mov_b32_e32 v52, s34 +; SI-NEXT: v_mov_b32_e32 v51, s30 +; SI-NEXT: v_mov_b32_e32 v50, s94 +; SI-NEXT: v_mov_b32_e32 v49, s92 +; SI-NEXT: v_mov_b32_e32 v48, s90 +; SI-NEXT: v_mov_b32_e32 v38, s88 +; SI-NEXT: v_mov_b32_e32 v37, s78 +; SI-NEXT: v_mov_b32_e32 v36, s76 +; SI-NEXT: v_mov_b32_e32 v35, s74 +; SI-NEXT: v_mov_b32_e32 v34, s72 +; SI-NEXT: v_mov_b32_e32 v33, s62 +; SI-NEXT: v_mov_b32_e32 v29, s60 +; SI-NEXT: v_mov_b32_e32 v28, s58 +; SI-NEXT: v_mov_b32_e32 v24, s56 +; SI-NEXT: v_mov_b32_e32 v23, s46 +; SI-NEXT: v_mov_b32_e32 v22, s44 +; SI-NEXT: v_mov_b32_e32 v21, s42 +; SI-NEXT: v_mov_b32_e32 v17, s40 +; SI-NEXT: v_mov_b32_e32 v16, s28 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v10, s22 ; SI-NEXT: v_mov_b32_e32 v9, s20 ; SI-NEXT: .LBB85_5: ; %end -; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v53 ; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s6, s87, 8 +; SI-NEXT: s_lshl_b32 s6, s85, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v52 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s86, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v51 +; SI-NEXT: s_and_b32 s6, s84, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v51 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s85, 24 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: s_lshl_b32 s8, s83, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -53254,21 +53131,21 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v50 ; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s6, s84, 8 +; SI-NEXT: s_lshl_b32 s6, s82, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v49 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s83, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v48 +; SI-NEXT: s_and_b32 s6, s81, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v48 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s82, 24 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: s_lshl_b32 s8, s80, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -53279,21 +53156,21 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v19 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v38 ; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: s_lshl_b32 s6, s81, 8 +; SI-NEXT: s_lshl_b32 s6, s71, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v37 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s80, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 +; SI-NEXT: s_and_b32 s6, s70, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s71, 24 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: s_lshl_b32 s8, s69, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -53308,17 +53185,17 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v35 ; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s6, s70, 8 +; SI-NEXT: s_lshl_b32 s6, s68, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v34 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s69, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v30 +; SI-NEXT: s_and_b32 s6, s67, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s68, 24 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: s_lshl_b32 s8, s66, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -53331,17 +53208,17 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v29 ; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s6, s67, 8 +; SI-NEXT: s_lshl_b32 s6, s65, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v28 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s66, 0xff +; SI-NEXT: s_and_b32 s6, s64, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s65, 24 +; SI-NEXT: s_lshl_b32 s8, s55, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53358,15 +53235,15 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 ; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s6, s64, 8 +; SI-NEXT: s_lshl_b32 s6, s54, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v22 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s55, 0xff +; SI-NEXT: s_and_b32 s6, s53, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v21 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s54, 24 +; SI-NEXT: s_lshl_b32 s8, s52, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53382,16 +53259,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 ; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s6, s53, 8 +; SI-NEXT: s_lshl_b32 s6, s51, 8 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s52, 0xff +; SI-NEXT: s_and_b32 s6, s50, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v15 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s51, 24 +; SI-NEXT: s_lshl_b32 s7, s49, 24 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53407,16 +53284,16 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: s_lshl_b32 s5, s50, 8 +; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s49, 0xff +; SI-NEXT: s_and_b32 s5, s39, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v9 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s48, 24 +; SI-NEXT: s_lshl_b32 s6, s38, 24 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff @@ -53429,8 +53306,6 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s87, v40, 31 -; SI-NEXT: v_readlane_b32 s86, v40, 30 ; SI-NEXT: v_readlane_b32 s85, v40, 29 ; SI-NEXT: v_readlane_b32 s84, v40, 28 ; SI-NEXT: v_readlane_b32 s83, v40, 27 @@ -53470,27 +53345,27 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-LABEL: bitcast_v8f64_to_v64i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v40, s30, 0 -; VI-NEXT: v_writelane_b32 v40, s31, 1 -; VI-NEXT: v_writelane_b32 v40, s34, 2 -; VI-NEXT: v_writelane_b32 v40, s35, 3 -; VI-NEXT: v_writelane_b32 v40, s36, 4 -; VI-NEXT: v_writelane_b32 v40, s37, 5 -; VI-NEXT: v_writelane_b32 v40, s38, 6 -; VI-NEXT: v_writelane_b32 v40, s39, 7 -; VI-NEXT: v_writelane_b32 v40, s48, 8 -; VI-NEXT: v_writelane_b32 v40, s49, 9 -; VI-NEXT: v_writelane_b32 v40, s50, 10 -; VI-NEXT: v_writelane_b32 v40, s51, 11 -; VI-NEXT: v_writelane_b32 v40, s52, 12 -; VI-NEXT: v_writelane_b32 v40, s53, 13 -; VI-NEXT: v_writelane_b32 v40, s54, 14 -; VI-NEXT: v_writelane_b32 v40, s55, 15 -; VI-NEXT: v_writelane_b32 v40, s64, 16 -; VI-NEXT: v_writelane_b32 v40, s65, 17 +; VI-NEXT: v_writelane_b32 v33, s30, 0 +; VI-NEXT: v_writelane_b32 v33, s31, 1 +; VI-NEXT: v_writelane_b32 v33, s34, 2 +; VI-NEXT: v_writelane_b32 v33, s35, 3 +; VI-NEXT: v_writelane_b32 v33, s36, 4 +; VI-NEXT: v_writelane_b32 v33, s37, 5 +; VI-NEXT: v_writelane_b32 v33, s38, 6 +; VI-NEXT: v_writelane_b32 v33, s39, 7 +; VI-NEXT: v_writelane_b32 v33, s48, 8 +; VI-NEXT: v_writelane_b32 v33, s49, 9 +; VI-NEXT: v_writelane_b32 v33, s50, 10 +; VI-NEXT: v_writelane_b32 v33, s51, 11 +; VI-NEXT: v_writelane_b32 v33, s52, 12 +; VI-NEXT: v_writelane_b32 v33, s53, 13 +; VI-NEXT: v_writelane_b32 v33, s54, 14 +; VI-NEXT: v_writelane_b32 v33, s55, 15 +; VI-NEXT: v_writelane_b32 v33, s64, 16 +; VI-NEXT: v_writelane_b32 v33, s65, 17 ; VI-NEXT: v_mov_b32_e32 v4, s16 ; VI-NEXT: v_mov_b32_e32 v5, s17 ; VI-NEXT: v_mov_b32_e32 v6, s18 @@ -53506,7 +53381,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v16, s28 ; VI-NEXT: v_mov_b32_e32 v17, s29 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v40, s66, 18 +; VI-NEXT: v_writelane_b32 v33, s66, 18 ; VI-NEXT: v_readfirstlane_b32 s18, v4 ; VI-NEXT: v_readfirstlane_b32 s19, v5 ; VI-NEXT: v_readfirstlane_b32 s16, v6 @@ -53524,7 +53399,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: v_readfirstlane_b32 s4, v1 ; VI-NEXT: s_and_b64 s[20:21], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: v_writelane_b32 v40, s67, 19 +; VI-NEXT: v_writelane_b32 v33, s67, 19 ; VI-NEXT: s_cbranch_scc0 .LBB85_3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s56, s5, 24 @@ -53577,70 +53452,70 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; VI-NEXT: s_cbranch_execnz .LBB85_4 ; VI-NEXT: .LBB85_2: ; %cmp.true -; VI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 -; VI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 -; VI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 -; VI-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 ; VI-NEXT: v_add_f64 v[11:12], s[12:13], 1.0 ; VI-NEXT: v_add_f64 v[15:16], s[14:15], 1.0 -; VI-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 -; VI-NEXT: v_add_f64 v[13:14], s[18:19], 1.0 -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] -; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; VI-NEXT: v_readfirstlane_b32 s19, v14 -; VI-NEXT: v_readfirstlane_b32 s17, v10 +; VI-NEXT: v_add_f64 v[19:20], s[16:17], 1.0 +; VI-NEXT: v_add_f64 v[23:24], s[18:19], 1.0 +; VI-NEXT: v_add_f64 v[9:10], s[10:11], 1.0 +; VI-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; VI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 +; VI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[19:20] +; VI-NEXT: v_readfirstlane_b32 s19, v24 +; VI-NEXT: v_readfirstlane_b32 s17, v20 ; VI-NEXT: v_readfirstlane_b32 s15, v16 ; VI-NEXT: v_readfirstlane_b32 s13, v12 -; VI-NEXT: v_readfirstlane_b32 s11, v8 +; VI-NEXT: v_readfirstlane_b32 s11, v10 ; VI-NEXT: v_readfirstlane_b32 s9, v6 ; VI-NEXT: v_readfirstlane_b32 s7, v4 ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[28:29], 24, v[23:24] ; VI-NEXT: s_lshr_b32 s56, s5, 24 ; VI-NEXT: s_lshr_b32 s57, s5, 16 ; VI-NEXT: s_lshr_b32 s58, s5, 8 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; VI-NEXT: s_lshr_b32 s59, s7, 24 ; VI-NEXT: s_lshr_b32 s60, s7, 16 ; VI-NEXT: s_lshr_b32 s61, s7, 8 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; VI-NEXT: s_lshr_b32 s62, s9, 24 ; VI-NEXT: s_lshr_b32 s63, s9, 16 ; VI-NEXT: s_lshr_b32 s72, s9, 8 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v5 ; VI-NEXT: s_lshr_b32 s73, s11, 24 ; VI-NEXT: s_lshr_b32 s74, s11, 16 ; VI-NEXT: s_lshr_b32 s75, s11, 8 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v9 ; VI-NEXT: s_lshr_b32 s76, s13, 24 ; VI-NEXT: s_lshr_b32 s77, s13, 16 ; VI-NEXT: s_lshr_b32 s78, s13, 8 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v11 ; VI-NEXT: s_lshr_b32 s79, s15, 24 ; VI-NEXT: s_lshr_b32 s88, s15, 16 ; VI-NEXT: s_lshr_b32 s89, s15, 8 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v15 ; VI-NEXT: s_lshr_b32 s90, s17, 24 ; VI-NEXT: s_lshr_b32 s91, s17, 16 ; VI-NEXT: s_lshr_b32 s30, s17, 8 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v19 ; VI-NEXT: s_lshr_b32 s31, s19, 24 ; VI-NEXT: s_lshr_b32 s34, s19, 16 ; VI-NEXT: s_lshr_b32 s35, s19, 8 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v23 ; VI-NEXT: s_branch .LBB85_5 ; VI-NEXT: .LBB85_3: ; VI-NEXT: ; implicit-def: $sgpr66 @@ -53693,181 +53568,181 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: s_branch .LBB85_2 ; VI-NEXT: .LBB85_4: -; VI-NEXT: v_mov_b32_e32 v13, s18 -; VI-NEXT: v_mov_b32_e32 v9, s16 -; VI-NEXT: v_mov_b32_e32 v48, s67 -; VI-NEXT: v_mov_b32_e32 v49, s66 -; VI-NEXT: v_mov_b32_e32 v38, s65 -; VI-NEXT: v_mov_b32_e32 v39, s64 -; VI-NEXT: v_mov_b32_e32 v36, s55 -; VI-NEXT: v_mov_b32_e32 v37, s54 -; VI-NEXT: v_mov_b32_e32 v34, s53 -; VI-NEXT: v_mov_b32_e32 v35, s52 -; VI-NEXT: v_mov_b32_e32 v32, s51 -; VI-NEXT: v_mov_b32_e32 v33, s50 -; VI-NEXT: v_mov_b32_e32 v30, s49 -; VI-NEXT: v_mov_b32_e32 v31, s48 -; VI-NEXT: v_mov_b32_e32 v28, s39 -; VI-NEXT: v_mov_b32_e32 v29, s38 -; VI-NEXT: v_mov_b32_e32 v26, s37 -; VI-NEXT: v_mov_b32_e32 v27, s36 +; VI-NEXT: v_mov_b32_e32 v23, s18 +; VI-NEXT: v_mov_b32_e32 v19, s16 ; VI-NEXT: v_mov_b32_e32 v15, s14 ; VI-NEXT: v_mov_b32_e32 v11, s12 -; VI-NEXT: v_mov_b32_e32 v7, s10 +; VI-NEXT: v_mov_b32_e32 v9, s10 ; VI-NEXT: v_mov_b32_e32 v5, s8 ; VI-NEXT: v_mov_b32_e32 v3, s6 ; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v24, s20 -; VI-NEXT: v_mov_b32_e32 v23, s22 -; VI-NEXT: v_mov_b32_e32 v22, s24 -; VI-NEXT: v_mov_b32_e32 v21, s26 -; VI-NEXT: v_mov_b32_e32 v20, s28 -; VI-NEXT: v_mov_b32_e32 v19, s40 -; VI-NEXT: v_mov_b32_e32 v18, s42 -; VI-NEXT: v_mov_b32_e32 v17, s44 +; VI-NEXT: v_mov_b32_e32 v28, s20 +; VI-NEXT: v_mov_b32_e32 v31, s67 +; VI-NEXT: v_mov_b32_e32 v32, s66 +; VI-NEXT: v_mov_b32_e32 v27, s22 +; VI-NEXT: v_mov_b32_e32 v29, s65 +; VI-NEXT: v_mov_b32_e32 v30, s64 +; VI-NEXT: v_mov_b32_e32 v26, s24 +; VI-NEXT: v_mov_b32_e32 v22, s55 +; VI-NEXT: v_mov_b32_e32 v24, s54 +; VI-NEXT: v_mov_b32_e32 v25, s26 +; VI-NEXT: v_mov_b32_e32 v18, s53 +; VI-NEXT: v_mov_b32_e32 v20, s52 +; VI-NEXT: v_mov_b32_e32 v21, s28 +; VI-NEXT: v_mov_b32_e32 v14, s51 +; VI-NEXT: v_mov_b32_e32 v16, s50 +; VI-NEXT: v_mov_b32_e32 v17, s40 +; VI-NEXT: v_mov_b32_e32 v10, s49 +; VI-NEXT: v_mov_b32_e32 v12, s48 +; VI-NEXT: v_mov_b32_e32 v13, s42 +; VI-NEXT: v_mov_b32_e32 v6, s39 +; VI-NEXT: v_mov_b32_e32 v8, s38 +; VI-NEXT: v_mov_b32_e32 v7, s44 +; VI-NEXT: v_mov_b32_e32 v2, s37 +; VI-NEXT: v_mov_b32_e32 v4, s36 ; VI-NEXT: .LBB85_5: ; %end ; VI-NEXT: s_and_b32 s4, s19, 0xff ; VI-NEXT: s_lshl_b32 s6, s35, 8 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s34, 0xff ; VI-NEXT: s_lshl_b32 s8, s31, 8 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v28 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v24 +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v31, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v48, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v23, vcc, 4, v0 +; VI-NEXT: v_mov_b32_e32 v28, s4 ; VI-NEXT: s_and_b32 s4, s17, 0xff ; VI-NEXT: s_lshl_b32 s6, s30, 8 -; VI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v28, v23, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v30 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s91, 0xff ; VI-NEXT: s_lshl_b32 s8, s90, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v39 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 +; VI-NEXT: v_or_b32_sdwa v19, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v29, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-NEXT: v_or_b32_sdwa v19, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v23, vcc, 8, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v19, vcc, 12, v0 +; VI-NEXT: v_mov_b32_e32 v23, s4 ; VI-NEXT: s_and_b32 s4, s15, 0xff ; VI-NEXT: s_lshl_b32 s6, s89, 8 -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v23, v19, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v24 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s88, 0xff ; VI-NEXT: s_lshl_b32 s8, s79, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; VI-NEXT: v_or_b32_sdwa v15, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v26 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v22, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: v_or_b32_sdwa v15, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v19, vcc, 16, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: buffer_store_dword v15, v19, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v15, vcc, 20, v0 +; VI-NEXT: v_mov_b32_e32 v19, s4 ; VI-NEXT: s_and_b32 s4, s13, 0xff ; VI-NEXT: s_lshl_b32 s6, s78, 8 -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v19, v15, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v20 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s77, 0xff ; VI-NEXT: s_lshl_b32 s8, s76, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; VI-NEXT: v_or_b32_sdwa v11, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v25 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v4, vcc, 24, v0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v15, vcc, 24, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v11, vcc, 28, v0 +; VI-NEXT: v_mov_b32_e32 v15, s4 ; VI-NEXT: s_and_b32 s4, s11, 0xff ; VI-NEXT: s_lshl_b32 s6, s75, 8 -; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v15, v11, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v16 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s74, 0xff ; VI-NEXT: s_lshl_b32 s8, s73, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v33 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v20 +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v21 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v11, vcc, 32, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v9, vcc, 36, v0 +; VI-NEXT: v_mov_b32_e32 v11, s4 ; VI-NEXT: s_and_b32 s4, s9, 0xff ; VI-NEXT: s_lshl_b32 s6, s72, 8 -; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v12 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s63, 0xff ; VI-NEXT: s_lshl_b32 s8, s62, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v19 +; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 ; VI-NEXT: s_or_b32 s6, s6, s8 -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v9, vcc, 40, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v5, vcc, 44, v0 +; VI-NEXT: v_mov_b32_e32 v9, s4 ; VI-NEXT: s_and_b32 s4, s7, 0xff ; VI-NEXT: s_lshl_b32 s6, s61, 8 -; VI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; VI-NEXT: buffer_store_dword v9, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 ; VI-NEXT: s_or_b32 s4, s4, s6 ; VI-NEXT: s_and_b32 s6, s60, 0xff ; VI-NEXT: s_lshl_b32 s7, s59, 8 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v13 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v0 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 48, v0 ; VI-NEXT: s_or_b32 s4, s4, s6 -; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v3, vcc, 52, v0 +; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: s_and_b32 s4, s5, 0xff ; VI-NEXT: s_lshl_b32 s5, s58, 8 -; VI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; VI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s57, 0xff ; VI-NEXT: s_lshl_b32 s6, s56, 8 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 ; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -53877,28 +53752,28 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_readlane_b32 s67, v40, 19 -; VI-NEXT: v_readlane_b32 s66, v40, 18 -; VI-NEXT: v_readlane_b32 s65, v40, 17 -; VI-NEXT: v_readlane_b32 s64, v40, 16 -; VI-NEXT: v_readlane_b32 s55, v40, 15 -; VI-NEXT: v_readlane_b32 s54, v40, 14 -; VI-NEXT: v_readlane_b32 s53, v40, 13 -; VI-NEXT: v_readlane_b32 s52, v40, 12 -; VI-NEXT: v_readlane_b32 s51, v40, 11 -; VI-NEXT: v_readlane_b32 s50, v40, 10 -; VI-NEXT: v_readlane_b32 s49, v40, 9 -; VI-NEXT: v_readlane_b32 s48, v40, 8 -; VI-NEXT: v_readlane_b32 s39, v40, 7 -; VI-NEXT: v_readlane_b32 s38, v40, 6 -; VI-NEXT: v_readlane_b32 s37, v40, 5 -; VI-NEXT: v_readlane_b32 s36, v40, 4 -; VI-NEXT: v_readlane_b32 s35, v40, 3 -; VI-NEXT: v_readlane_b32 s34, v40, 2 -; VI-NEXT: v_readlane_b32 s31, v40, 1 -; VI-NEXT: v_readlane_b32 s30, v40, 0 -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: v_readlane_b32 s67, v33, 19 +; VI-NEXT: v_readlane_b32 s66, v33, 18 +; VI-NEXT: v_readlane_b32 s65, v33, 17 +; VI-NEXT: v_readlane_b32 s64, v33, 16 +; VI-NEXT: v_readlane_b32 s55, v33, 15 +; VI-NEXT: v_readlane_b32 s54, v33, 14 +; VI-NEXT: v_readlane_b32 s53, v33, 13 +; VI-NEXT: v_readlane_b32 s52, v33, 12 +; VI-NEXT: v_readlane_b32 s51, v33, 11 +; VI-NEXT: v_readlane_b32 s50, v33, 10 +; VI-NEXT: v_readlane_b32 s49, v33, 9 +; VI-NEXT: v_readlane_b32 s48, v33, 8 +; VI-NEXT: v_readlane_b32 s39, v33, 7 +; VI-NEXT: v_readlane_b32 s38, v33, 6 +; VI-NEXT: v_readlane_b32 s37, v33, 5 +; VI-NEXT: v_readlane_b32 s36, v33, 4 +; VI-NEXT: v_readlane_b32 s35, v33, 3 +; VI-NEXT: v_readlane_b32 s34, v33, 2 +; VI-NEXT: v_readlane_b32 s31, v33, 1 +; VI-NEXT: v_readlane_b32 s30, v33, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -53906,23 +53781,23 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-LABEL: bitcast_v8f64_to_v64i8_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s35, 3 -; GFX9-NEXT: v_writelane_b32 v40, s36, 4 -; GFX9-NEXT: v_writelane_b32 v40, s37, 5 -; GFX9-NEXT: v_writelane_b32 v40, s38, 6 -; GFX9-NEXT: v_writelane_b32 v40, s39, 7 -; GFX9-NEXT: v_writelane_b32 v40, s48, 8 -; GFX9-NEXT: v_writelane_b32 v40, s49, 9 -; GFX9-NEXT: v_writelane_b32 v40, s50, 10 -; GFX9-NEXT: v_writelane_b32 v40, s51, 11 -; GFX9-NEXT: v_writelane_b32 v40, s52, 12 -; GFX9-NEXT: v_writelane_b32 v40, s53, 13 +; GFX9-NEXT: v_writelane_b32 v33, s30, 0 +; GFX9-NEXT: v_writelane_b32 v33, s31, 1 +; GFX9-NEXT: v_writelane_b32 v33, s34, 2 +; GFX9-NEXT: v_writelane_b32 v33, s35, 3 +; GFX9-NEXT: v_writelane_b32 v33, s36, 4 +; GFX9-NEXT: v_writelane_b32 v33, s37, 5 +; GFX9-NEXT: v_writelane_b32 v33, s38, 6 +; GFX9-NEXT: v_writelane_b32 v33, s39, 7 +; GFX9-NEXT: v_writelane_b32 v33, s48, 8 +; GFX9-NEXT: v_writelane_b32 v33, s49, 9 +; GFX9-NEXT: v_writelane_b32 v33, s50, 10 +; GFX9-NEXT: v_writelane_b32 v33, s51, 11 +; GFX9-NEXT: v_writelane_b32 v33, s52, 12 +; GFX9-NEXT: v_writelane_b32 v33, s53, 13 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 ; GFX9-NEXT: v_mov_b32_e32 v5, s17 ; GFX9-NEXT: v_mov_b32_e32 v6, s18 @@ -53938,7 +53813,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v16, s28 ; GFX9-NEXT: v_mov_b32_e32 v17, s29 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_writelane_b32 v40, s54, 14 +; GFX9-NEXT: v_writelane_b32 v33, s54, 14 ; GFX9-NEXT: v_readfirstlane_b32 s18, v4 ; GFX9-NEXT: v_readfirstlane_b32 s19, v5 ; GFX9-NEXT: v_readfirstlane_b32 s16, v6 @@ -53956,7 +53831,7 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_and_b64 s[20:21], vcc, exec ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: v_writelane_b32 v40, s55, 15 +; GFX9-NEXT: v_writelane_b32 v33, s55, 15 ; GFX9-NEXT: s_cbranch_scc0 .LBB85_3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 @@ -54009,70 +53884,70 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB85_4 ; GFX9-NEXT: .LBB85_2: ; %cmp.true -; GFX9-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 -; GFX9-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 -; GFX9-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 ; GFX9-NEXT: v_add_f64 v[7:8], s[10:11], 1.0 -; GFX9-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], s[12:13], 1.0 ; GFX9-NEXT: v_add_f64 v[15:16], s[14:15], 1.0 -; GFX9-NEXT: v_add_f64 v[11:12], s[16:17], 1.0 -; GFX9-NEXT: v_add_f64 v[13:14], s[18:19], 1.0 -; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[15:16] -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[11:12] -; GFX9-NEXT: v_readfirstlane_b32 s19, v14 -; GFX9-NEXT: v_readfirstlane_b32 s17, v12 +; GFX9-NEXT: v_add_f64 v[17:18], s[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], s[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], s[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[17:18] +; GFX9-NEXT: v_readfirstlane_b32 s19, v22 +; GFX9-NEXT: v_readfirstlane_b32 s17, v18 ; GFX9-NEXT: v_readfirstlane_b32 s15, v16 -; GFX9-NEXT: v_readfirstlane_b32 s13, v10 +; GFX9-NEXT: v_readfirstlane_b32 s13, v12 ; GFX9-NEXT: v_readfirstlane_b32 s11, v8 ; GFX9-NEXT: v_readfirstlane_b32 s9, v6 ; GFX9-NEXT: v_readfirstlane_b32 s7, v4 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[9:10], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[21:22] ; GFX9-NEXT: s_lshr_b32 s56, s5, 24 ; GFX9-NEXT: s_lshr_b32 s57, s5, 16 ; GFX9-NEXT: s_lshr_b32 s58, s5, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: s_lshr_b32 s59, s7, 24 ; GFX9-NEXT: s_lshr_b32 s60, s7, 16 ; GFX9-NEXT: s_lshr_b32 s61, s7, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX9-NEXT: s_lshr_b32 s62, s9, 24 ; GFX9-NEXT: s_lshr_b32 s63, s9, 16 ; GFX9-NEXT: s_lshr_b32 s72, s9, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v5 ; GFX9-NEXT: s_lshr_b32 s73, s11, 24 ; GFX9-NEXT: s_lshr_b32 s74, s11, 16 ; GFX9-NEXT: s_lshr_b32 s75, s11, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v7 ; GFX9-NEXT: s_lshr_b32 s76, s13, 24 ; GFX9-NEXT: s_lshr_b32 s77, s13, 16 ; GFX9-NEXT: s_lshr_b32 s78, s13, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 8, v11 ; GFX9-NEXT: s_lshr_b32 s79, s15, 24 ; GFX9-NEXT: s_lshr_b32 s88, s15, 16 ; GFX9-NEXT: s_lshr_b32 s89, s15, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v15 ; GFX9-NEXT: s_lshr_b32 s90, s17, 24 ; GFX9-NEXT: s_lshr_b32 s91, s17, 16 ; GFX9-NEXT: s_lshr_b32 s92, s17, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v17 ; GFX9-NEXT: s_lshr_b32 s93, s19, 24 ; GFX9-NEXT: s_lshr_b32 s94, s19, 16 ; GFX9-NEXT: s_lshr_b32 s95, s19, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v21 ; GFX9-NEXT: s_branch .LBB85_5 ; GFX9-NEXT: .LBB85_3: ; GFX9-NEXT: ; implicit-def: $sgpr54 @@ -54125,168 +54000,168 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: s_branch .LBB85_2 ; GFX9-NEXT: .LBB85_4: -; GFX9-NEXT: v_mov_b32_e32 v13, s18 -; GFX9-NEXT: v_mov_b32_e32 v11, s16 -; GFX9-NEXT: v_mov_b32_e32 v39, s55 -; GFX9-NEXT: v_mov_b32_e32 v49, s54 -; GFX9-NEXT: v_mov_b32_e32 v37, s53 -; GFX9-NEXT: v_mov_b32_e32 v48, s52 -; GFX9-NEXT: v_mov_b32_e32 v36, s51 -; GFX9-NEXT: v_mov_b32_e32 v38, s50 -; GFX9-NEXT: v_mov_b32_e32 v34, s49 -; GFX9-NEXT: v_mov_b32_e32 v35, s48 -; GFX9-NEXT: v_mov_b32_e32 v32, s39 -; GFX9-NEXT: v_mov_b32_e32 v33, s38 -; GFX9-NEXT: v_mov_b32_e32 v30, s37 -; GFX9-NEXT: v_mov_b32_e32 v31, s36 -; GFX9-NEXT: v_mov_b32_e32 v28, s35 -; GFX9-NEXT: v_mov_b32_e32 v29, s34 -; GFX9-NEXT: v_mov_b32_e32 v26, s31 -; GFX9-NEXT: v_mov_b32_e32 v27, s30 +; GFX9-NEXT: v_mov_b32_e32 v21, s18 +; GFX9-NEXT: v_mov_b32_e32 v17, s16 ; GFX9-NEXT: v_mov_b32_e32 v15, s14 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: v_mov_b32_e32 v11, s12 ; GFX9-NEXT: v_mov_b32_e32 v7, s10 ; GFX9-NEXT: v_mov_b32_e32 v5, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v24, s20 -; GFX9-NEXT: v_mov_b32_e32 v23, s22 -; GFX9-NEXT: v_mov_b32_e32 v22, s24 -; GFX9-NEXT: v_mov_b32_e32 v21, s26 -; GFX9-NEXT: v_mov_b32_e32 v20, s28 +; GFX9-NEXT: v_mov_b32_e32 v27, s20 +; GFX9-NEXT: v_mov_b32_e32 v31, s55 +; GFX9-NEXT: v_mov_b32_e32 v32, s54 +; GFX9-NEXT: v_mov_b32_e32 v26, s22 +; GFX9-NEXT: v_mov_b32_e32 v29, s53 +; GFX9-NEXT: v_mov_b32_e32 v30, s52 +; GFX9-NEXT: v_mov_b32_e32 v25, s24 +; GFX9-NEXT: v_mov_b32_e32 v22, s51 +; GFX9-NEXT: v_mov_b32_e32 v28, s50 +; GFX9-NEXT: v_mov_b32_e32 v24, s26 +; GFX9-NEXT: v_mov_b32_e32 v18, s49 +; GFX9-NEXT: v_mov_b32_e32 v20, s48 +; GFX9-NEXT: v_mov_b32_e32 v23, s28 +; GFX9-NEXT: v_mov_b32_e32 v14, s39 +; GFX9-NEXT: v_mov_b32_e32 v16, s38 ; GFX9-NEXT: v_mov_b32_e32 v19, s40 -; GFX9-NEXT: v_mov_b32_e32 v18, s42 -; GFX9-NEXT: v_mov_b32_e32 v17, s44 +; GFX9-NEXT: v_mov_b32_e32 v10, s37 +; GFX9-NEXT: v_mov_b32_e32 v12, s36 +; GFX9-NEXT: v_mov_b32_e32 v13, s42 +; GFX9-NEXT: v_mov_b32_e32 v6, s35 +; GFX9-NEXT: v_mov_b32_e32 v8, s34 +; GFX9-NEXT: v_mov_b32_e32 v9, s44 +; GFX9-NEXT: v_mov_b32_e32 v2, s31 +; GFX9-NEXT: v_mov_b32_e32 v4, s30 ; GFX9-NEXT: .LBB85_5: ; %end ; GFX9-NEXT: s_and_b32 s4, s19, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s95, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s94, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s93, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v39, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v31, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen +; GFX9-NEXT: v_mov_b32_e32 v21, s4 ; GFX9-NEXT: s_and_b32 s4, s17, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s92, 8 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v30 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s91, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s90, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v48 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v26 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v29, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_mov_b32_e32 v17, s4 ; GFX9-NEXT: s_and_b32 s4, s15, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s89, 8 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v28 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s88, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s79, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v38 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: s_and_b32 s4, s13, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s78, 8 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v20 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s77, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s76, 8 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v35 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_mov_b32_e32 v11, s4 ; GFX9-NEXT: s_and_b32 s4, s11, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s75, 8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v16 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s74, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s73, 8 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 ; GFX9-NEXT: s_and_b32 s4, s9, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s72, 8 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v12 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s63, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s62, 8 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v19 ; GFX9-NEXT: s_or_b32 s6, s6, s8 -; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 ; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s61, 8 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v8 ; GFX9-NEXT: s_or_b32 s4, s4, s6 ; GFX9-NEXT: s_and_b32 s6, s60, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s59, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v13 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s6 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_and_b32 s4, s5, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s58, 8 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s57, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s56, 8 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 ; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -54294,24 +54169,24 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_readlane_b32 s55, v40, 15 -; GFX9-NEXT: v_readlane_b32 s54, v40, 14 -; GFX9-NEXT: v_readlane_b32 s53, v40, 13 -; GFX9-NEXT: v_readlane_b32 s52, v40, 12 -; GFX9-NEXT: v_readlane_b32 s51, v40, 11 -; GFX9-NEXT: v_readlane_b32 s50, v40, 10 -; GFX9-NEXT: v_readlane_b32 s49, v40, 9 -; GFX9-NEXT: v_readlane_b32 s48, v40, 8 -; GFX9-NEXT: v_readlane_b32 s39, v40, 7 -; GFX9-NEXT: v_readlane_b32 s38, v40, 6 -; GFX9-NEXT: v_readlane_b32 s37, v40, 5 -; GFX9-NEXT: v_readlane_b32 s36, v40, 4 -; GFX9-NEXT: v_readlane_b32 s35, v40, 3 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s55, v33, 15 +; GFX9-NEXT: v_readlane_b32 s54, v33, 14 +; GFX9-NEXT: v_readlane_b32 s53, v33, 13 +; GFX9-NEXT: v_readlane_b32 s52, v33, 12 +; GFX9-NEXT: v_readlane_b32 s51, v33, 11 +; GFX9-NEXT: v_readlane_b32 s50, v33, 10 +; GFX9-NEXT: v_readlane_b32 s49, v33, 9 +; GFX9-NEXT: v_readlane_b32 s48, v33, 8 +; GFX9-NEXT: v_readlane_b32 s39, v33, 7 +; GFX9-NEXT: v_readlane_b32 s38, v33, 6 +; GFX9-NEXT: v_readlane_b32 s37, v33, 5 +; GFX9-NEXT: v_readlane_b32 s36, v33, 4 +; GFX9-NEXT: v_readlane_b32 s35, v33, 3 +; GFX9-NEXT: v_readlane_b32 s34, v33, 2 +; GFX9-NEXT: v_readlane_b32 s31, v33, 1 +; GFX9-NEXT: v_readlane_b32 s30, v33, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -54736,19 +54611,19 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v34, v6 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 @@ -54763,24 +54638,24 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v17 ; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v25 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 ; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v29 ; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 -; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v11 -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v7 +; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v15 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v17 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v2 ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v4 ; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v6 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -54798,9 +54673,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v53 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v55 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v40 ; SI-NEXT: s_waitcnt vmcnt(3) @@ -54816,47 +54691,43 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v47 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB86_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 ; SI-NEXT: v_or_b32_e32 v0, v0, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v48, v1 -; SI-NEXT: v_or_b32_e32 v3, v35, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v62, v9 @@ -54871,16 +54742,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v12, v54, v12 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -54890,67 +54758,79 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v36 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v3, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v37 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v18 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -54997,10 +54877,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v40 -; SI-NEXT: v_or_b32_e32 v13, v13, v19 +; SI-NEXT: v_or_b32_e32 v13, v13, v17 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v21 ; SI-NEXT: v_or_b32_e32 v14, v14, v25 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 @@ -55011,24 +54891,19 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v15, v23 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 @@ -55055,19 +54930,20 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: .LBB86_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB86_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -55081,17 +54957,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -55111,75 +54983,85 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v13, v19, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v27, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v29 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: s_mov_b32 s7, 0x3000000 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_or_b32_e32 v2, v36, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_or_b32_e32 v3, v37, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v3, v36, v3 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v16, v4 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v20, v5 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v2, v35, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v28, v7 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_or_b32_e32 v3, v38, v3 +; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v17, v8 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_or_b32_e32 v16, v50, v16 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_or_b32_e32 v4, v18, v4 @@ -55245,11 +55127,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_or_b32_e32 v13, v19, v13 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v21 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_or_b32_e32 v14, v25, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 @@ -55267,15 +55149,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v23, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x300, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v50, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 ; SI-NEXT: .LBB86_4: ; %end @@ -55343,7 +55219,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 @@ -55353,7 +55229,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 ; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 @@ -55383,7 +55259,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v26 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v30 ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v39 ; VI-NEXT: s_waitcnt vmcnt(7) @@ -55406,7 +55282,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 @@ -55422,20 +55298,18 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v30, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr53 @@ -55446,6 +55320,8 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -55455,43 +55331,43 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -55587,10 +55463,8 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55600,7 +55474,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v12, 3, v38 +; VI-NEXT: v_add_u16_e32 v12, 3, v30 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v13, 3, v58 @@ -55611,6 +55485,8 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v17, 3, v17 ; VI-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -55620,56 +55496,56 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_add_u16_e32 v1, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v4, 3, v4 ; VI-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v4, v4, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_u16_e32 v5, 3, v5 ; VI-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v6, 3, v6 ; VI-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v7, 3, v7 -; VI-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v53 ; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v16, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v8, 3, v8 ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_sdwa v15, v17, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 -; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_sdwa v15, v17, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -55797,7 +55673,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:80 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 @@ -55807,7 +55683,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 ; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 @@ -55841,7 +55717,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v26 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v38 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v30 ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v39 ; GFX9-NEXT: s_waitcnt vmcnt(7) @@ -55864,7 +55740,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:84 ; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 @@ -55880,20 +55756,18 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v30, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v53, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr53 @@ -55904,6 +55778,8 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -55913,43 +55789,43 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -56045,10 +55921,8 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -56058,7 +55932,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v13, 3, v58 @@ -56070,6 +55944,8 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v15, 3, v53 ; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -56079,53 +55955,53 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 ; GFX9-NEXT: v_or_b32_sdwa v6, v24, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 -; GFX9-NEXT: v_or_b32_sdwa v7, v30, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v16, 3, v17 ; GFX9-NEXT: v_or_b32_sdwa v16, v60, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 ; GFX9-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -57074,24 +56950,25 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v28 -; SI-NEXT: v_mov_b32_e32 v38, v26 -; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v50, v24 ; SI-NEXT: v_mov_b32_e32 v51, v14 ; SI-NEXT: v_mov_b32_e32 v54, v12 ; SI-NEXT: v_mov_b32_e32 v34, v10 -; SI-NEXT: v_mov_b32_e32 v44, v6 -; SI-NEXT: v_mov_b32_e32 v33, v4 -; SI-NEXT: v_mov_b32_e32 v32, v2 -; SI-NEXT: v_mov_b32_e32 v31, v0 +; SI-NEXT: v_mov_b32_e32 v31, v8 +; SI-NEXT: v_mov_b32_e32 v41, v6 +; SI-NEXT: v_mov_b32_e32 v42, v4 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 @@ -57103,8 +56980,9 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v5 @@ -57117,43 +56995,48 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v10 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v14 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v26 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB87_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v0, v0, v62 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v61, v1 +; SI-NEXT: v_or_b32_e32 v5, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 ; SI-NEXT: v_or_b32_e32 v0, v0, v60 @@ -57174,135 +57057,163 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 -; SI-NEXT: v_or_b32_e32 v0, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v0, v0, v23 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 +; SI-NEXT: v_or_b32_e32 v11, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 -; SI-NEXT: v_or_b32_e32 v0, v0, v17 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v0, v0, v27 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: v_mov_b32_e32 v26, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v0, v0, v29 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 -; SI-NEXT: v_or_b32_e32 v0, v0, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v15, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v0, v1 +; SI-NEXT: s_or_b32 s8, s4, s5 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: v_or_b32_e32 v2, v2, v62 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_mov_b32_e32 v24, v43 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_cbranch_execnz .LBB87_3 +; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s19, 24 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_lshl_b32 s5, s19, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s8, s27, 24 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v52, v42 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_cbranch_execnz .LBB87_3 -; SI-NEXT: .LBB87_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v63, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s8, s29, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, v50, v0 -; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_or_b32_e32 v0, v30, v0 +; SI-NEXT: v_or_b32_e32 v0, s7, v0 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v31 ; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -57343,85 +57254,30 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 ; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v48 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s20, 0xff -; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s24, 0xff -; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s26, 0xff -; SI-NEXT: s_addk_i32 s6, 0x300 -; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_add_i32 s4, s4, 0x3000000 -; SI-NEXT: s_add_i32 s5, s5, 0x3000000 -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 ; SI-NEXT: v_or_b32_e32 v0, v21, v0 @@ -57431,44 +57287,59 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v26, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 -; SI-NEXT: v_or_b32_e32 v0, v28, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -57493,9 +57364,8 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB87_4: -; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v26, v8 -; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: v_mov_b32_e32 v26, v13 +; SI-NEXT: v_mov_b32_e32 v24, v43 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB87_2 ; @@ -57521,33 +57391,34 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v36, v28 ; VI-NEXT: v_mov_b32_e32 v35, v26 ; VI-NEXT: v_mov_b32_e32 v34, v24 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: v_mov_b32_e32 v48, v12 -; VI-NEXT: v_mov_b32_e32 v49, v10 -; VI-NEXT: v_mov_b32_e32 v50, v8 -; VI-NEXT: v_mov_b32_e32 v51, v6 -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v45, v0 +; VI-NEXT: v_mov_b32_e32 v33, v14 +; VI-NEXT: v_mov_b32_e32 v39, v12 +; VI-NEXT: v_mov_b32_e32 v48, v10 +; VI-NEXT: v_mov_b32_e32 v49, v8 +; VI-NEXT: v_mov_b32_e32 v50, v6 +; VI-NEXT: v_mov_b32_e32 v51, v4 +; VI-NEXT: v_mov_b32_e32 v53, v2 +; VI-NEXT: v_mov_b32_e32 v44, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:52 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:68 ; VI-NEXT: v_mov_b32_e32 v37, v30 ; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 @@ -57561,147 +57432,179 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v4 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB87_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: v_or_b32_sdwa v0, v53, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v50, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v33, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s8, s4, s5 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v52, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v1, v42, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v43, v44 ; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: s_cbranch_execnz .LBB87_3 +; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_addk_i32 s4, 0x300 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff ; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_addk_i32 s5, 0x300 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_addk_i32 s6, 0x300 +; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v28, v44 -; VI-NEXT: v_mov_b32_e32 v33, v42 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_cbranch_execnz .LBB87_3 -; VI-NEXT: .LBB87_2: ; %cmp.true -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v28 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v45 -; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s8, s29, 8 +; VI-NEXT: s_or_b32 s7, s8, s7 +; VI-NEXT: s_addk_i32 s7, 0x300 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v43 +; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: v_or_b32_e32 v0, s7, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 ; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 ; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 ; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 @@ -57719,7 +57622,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 @@ -57731,90 +57634,50 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v31 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 -; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_addk_i32 s5, 0x300 -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: s_or_b32 s5, s6, s5 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_addk_i32 s6, 0x300 -; VI-NEXT: s_or_b32 s7, s8, s7 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s7, s6 -; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: s_add_i32 s5, s5, 0x3000000 -; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v40 +; VI-NEXT: s_add_i32 s4, s4, 0x3000000 +; VI-NEXT: s_add_i32 s5, s5, 0x3000000 +; VI-NEXT: s_add_i32 s6, s6, 0x3000000 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 -; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v42 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB87_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -57835,9 +57698,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB87_4: -; VI-NEXT: v_mov_b32_e32 v28, v44 -; VI-NEXT: v_mov_b32_e32 v26, v4 -; VI-NEXT: v_mov_b32_e32 v33, v42 +; VI-NEXT: v_mov_b32_e32 v43, v44 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB87_2 ; @@ -57863,33 +57724,34 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v36, v28 ; GFX9-NEXT: v_mov_b32_e32 v35, v26 ; GFX9-NEXT: v_mov_b32_e32 v34, v24 -; GFX9-NEXT: v_mov_b32_e32 v39, v14 -; GFX9-NEXT: v_mov_b32_e32 v48, v12 -; GFX9-NEXT: v_mov_b32_e32 v49, v10 -; GFX9-NEXT: v_mov_b32_e32 v50, v8 -; GFX9-NEXT: v_mov_b32_e32 v51, v6 -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v45, v0 +; GFX9-NEXT: v_mov_b32_e32 v33, v14 +; GFX9-NEXT: v_mov_b32_e32 v39, v12 +; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_mov_b32_e32 v49, v8 +; GFX9-NEXT: v_mov_b32_e32 v50, v6 +; GFX9-NEXT: v_mov_b32_e32 v51, v4 +; GFX9-NEXT: v_mov_b32_e32 v53, v2 +; GFX9-NEXT: v_mov_b32_e32 v44, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:52 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_mov_b32_e32 v37, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 @@ -57903,7 +57765,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v25 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v27 ; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(19) @@ -57911,256 +57773,248 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: s_and_b32 s4, s16, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s17, 8 +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s19, 8 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s8, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v43, v44 ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: s_cbranch_execnz .LBB87_3 +; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_add_i32 s16, s16, 3 +; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s19, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 +; GFX9-NEXT: s_and_b32 s6, s18, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s19, 8 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s5, s5, 16 -; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s23, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_add_i32 s20, s20, 3 +; GFX9-NEXT: s_or_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s6, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s21, 8 +; GFX9-NEXT: s_add_i32 s22, s22, 3 +; GFX9-NEXT: s_or_b32 s6, s7, s6 +; GFX9-NEXT: s_and_b32 s7, s22, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s23, 8 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_addk_i32 s6, 0x300 +; GFX9-NEXT: s_addk_i32 s7, 0x300 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v28, v44 -; GFX9-NEXT: v_mov_b32_e32 v33, v42 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_cbranch_execnz .LBB87_3 -; GFX9-NEXT: .LBB87_2: ; %cmp.true -; GFX9-NEXT: v_add_u32_e32 v1, 3, v28 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v26 -; GFX9-NEXT: s_movk_i32 s4, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: s_and_b32 s7, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s25, 8 +; GFX9-NEXT: s_add_i32 s26, s26, 3 +; GFX9-NEXT: s_or_b32 s7, s8, s7 +; GFX9-NEXT: s_and_b32 s8, s26, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s27, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 +; GFX9-NEXT: s_and_b32 s7, s7, 0xffff +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_add_i32 s28, s28, 3 -; GFX9-NEXT: s_and_b32 s5, s28, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s29, 8 -; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 -; GFX9-NEXT: s_addk_i32 s5, 0x300 +; GFX9-NEXT: s_or_b32 s7, s7, s8 +; GFX9-NEXT: s_and_b32 s8, s28, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s29, 8 +; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 +; GFX9-NEXT: s_movk_i32 s5, 0x300 +; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: v_add_u32_sdwa v0, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v51 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v50 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 ; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 ; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v35 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 ; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v40 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v31 ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_add_i32 s16, s16, 3 -; GFX9-NEXT: s_and_b32 s5, s16, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s17, 8 -; GFX9-NEXT: s_add_i32 s18, s18, 3 -; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: s_and_b32 s6, s18, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s19, 8 -; GFX9-NEXT: s_or_b32 s6, s7, s6 -; GFX9-NEXT: s_addk_i32 s5, 0x300 -; GFX9-NEXT: s_addk_i32 s6, 0x300 -; GFX9-NEXT: s_and_b32 s5, s5, 0xffff -; GFX9-NEXT: s_lshl_b32 s6, s6, 16 -; GFX9-NEXT: s_add_i32 s20, s20, 3 -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s21, 8 -; GFX9-NEXT: s_add_i32 s22, s22, 3 -; GFX9-NEXT: s_or_b32 s6, s7, s6 -; GFX9-NEXT: s_and_b32 s7, s22, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s23, 8 -; GFX9-NEXT: s_or_b32 s7, s8, s7 -; GFX9-NEXT: s_addk_i32 s6, 0x300 -; GFX9-NEXT: s_addk_i32 s7, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 -; GFX9-NEXT: s_and_b32 s6, s6, 0xffff -; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_add_i32 s24, s24, 3 -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s8, s25, 8 -; GFX9-NEXT: s_add_i32 s26, s26, 3 -; GFX9-NEXT: s_or_b32 s7, s8, s7 -; GFX9-NEXT: s_and_b32 s8, s26, 0xff -; GFX9-NEXT: s_lshl_b32 s9, s27, 8 -; GFX9-NEXT: s_or_b32 s8, s9, s8 -; GFX9-NEXT: s_addk_i32 s7, 0x300 -; GFX9-NEXT: s_addk_i32 s8, 0x300 -; GFX9-NEXT: s_and_b32 s7, s7, 0xffff -; GFX9-NEXT: s_lshl_b32 s8, s8, 16 -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v52 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB87_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -58181,9 +58035,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB87_4: -; GFX9-NEXT: v_mov_b32_e32 v28, v44 -; GFX9-NEXT: v_mov_b32_e32 v26, v4 -; GFX9-NEXT: v_mov_b32_e32 v33, v42 +; GFX9-NEXT: v_mov_b32_e32 v43, v44 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB87_2 ; @@ -60101,179 +59953,174 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v21, v16 ; SI-NEXT: v_mov_b32_e32 v25, v15 -; SI-NEXT: v_mov_b32_e32 v26, v12 -; SI-NEXT: v_mov_b32_e32 v29, v11 -; SI-NEXT: v_mov_b32_e32 v22, v8 -; SI-NEXT: v_mov_b32_e32 v30, v7 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v33, v3 -; SI-NEXT: v_mov_b32_e32 v34, v0 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB91_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB91_3 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_or_b32_e32 v14, v14, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_or_b32_e32 v18, v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_or_b32_e32 v22, v22, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 -; SI-NEXT: v_or_b32_e32 v26, v26, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v33 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_or_b32_e32 v30, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 +; SI-NEXT: v_or_b32_e32 v26, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v22, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v18, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v14, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v10, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_or_b32_e32 v6, v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v2, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[33:34], v[13:14], 16 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v30, v30, v32 +; SI-NEXT: v_lshr_b64 v[35:36], v[21:22], 16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshr_b64 v[37:38], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[29:30], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 @@ -60283,14 +60130,14 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v24, v24, v25 ; SI-NEXT: v_or_b32_e32 v28, v28, v29 ; SI-NEXT: .LBB91_3: ; %end -; SI-NEXT: v_mov_b32_e32 v1, v37 -; SI-NEXT: v_mov_b32_e32 v5, v35 -; SI-NEXT: v_mov_b32_e32 v9, v33 -; SI-NEXT: v_mov_b32_e32 v13, v38 -; SI-NEXT: v_mov_b32_e32 v17, v48 -; SI-NEXT: v_mov_b32_e32 v21, v49 -; SI-NEXT: v_mov_b32_e32 v25, v52 -; SI-NEXT: v_mov_b32_e32 v29, v50 +; SI-NEXT: v_mov_b32_e32 v1, v32 +; SI-NEXT: v_mov_b32_e32 v5, v48 +; SI-NEXT: v_mov_b32_e32 v9, v49 +; SI-NEXT: v_mov_b32_e32 v13, v33 +; SI-NEXT: v_mov_b32_e32 v17, v34 +; SI-NEXT: v_mov_b32_e32 v21, v35 +; SI-NEXT: v_mov_b32_e32 v25, v36 +; SI-NEXT: v_mov_b32_e32 v29, v37 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB91_4: ; SI-NEXT: s_branch .LBB91_2 @@ -60843,20 +60690,20 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v20 ; SI-NEXT: s_cbranch_scc0 .LBB93_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_lshl_b32 s14, s16, 16 -; SI-NEXT: s_lshl_b32 s15, s17, 16 -; SI-NEXT: s_lshl_b32 s40, s18, 16 -; SI-NEXT: s_lshl_b32 s41, s19, 16 -; SI-NEXT: s_lshl_b32 s42, s20, 16 -; SI-NEXT: s_lshl_b32 s43, s21, 16 -; SI-NEXT: s_lshl_b32 s6, s22, 16 -; SI-NEXT: s_lshl_b32 s7, s23, 16 -; SI-NEXT: s_lshl_b32 s8, s24, 16 -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: s_lshl_b32 s10, s26, 16 -; SI-NEXT: s_lshl_b32 s11, s27, 16 -; SI-NEXT: s_lshl_b32 s12, s28, 16 -; SI-NEXT: s_lshl_b32 s13, s29, 16 +; SI-NEXT: s_lshl_b32 s6, s16, 16 +; SI-NEXT: s_lshl_b32 s7, s17, 16 +; SI-NEXT: s_lshl_b32 s8, s18, 16 +; SI-NEXT: s_lshl_b32 s9, s19, 16 +; SI-NEXT: s_lshl_b32 s10, s20, 16 +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_lshl_b32 s12, s22, 16 +; SI-NEXT: s_lshl_b32 s13, s23, 16 +; SI-NEXT: s_lshl_b32 s14, s24, 16 +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_lshl_b32 s40, s26, 16 +; SI-NEXT: s_lshl_b32 s41, s27, 16 +; SI-NEXT: s_lshl_b32 s42, s28, 16 +; SI-NEXT: s_lshl_b32 s43, s29, 16 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 @@ -60881,14 +60728,19 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: s_lshl_b32 s7, s25, 16 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s8, s6, 0x30000 +; SI-NEXT: s_add_i32 s14, s6, 0x30000 ; SI-NEXT: s_and_b32 s6, s22, 0xffff ; SI-NEXT: s_lshl_b32 s7, s23, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s12, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s20, 0xffff -; SI-NEXT: s_lshl_b32 s9, s21, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s10, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s18, 0xffff +; SI-NEXT: s_lshl_b32 s7, s19, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v32 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v12 @@ -60898,9 +60750,7 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: s_and_b32 s9, s18, 0xffff -; SI-NEXT: s_lshl_b32 s10, s19, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 @@ -60911,9 +60761,9 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s16, 0xffff -; SI-NEXT: s_lshl_b32 s11, s17, 16 +; SI-NEXT: s_add_i32 s8, s6, 0x30000 +; SI-NEXT: s_and_b32 s6, s16, 0xffff +; SI-NEXT: s_lshl_b32 s7, s17, 16 ; SI-NEXT: v_or_b32_e32 v1, v31, v1 ; SI-NEXT: v_or_b32_e32 v3, v29, v3 ; SI-NEXT: v_or_b32_e32 v5, v27, v5 @@ -60923,7 +60773,7 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: v_or_b32_e32 v4, v19, v4 ; SI-NEXT: v_or_b32_e32 v2, v17, v2 ; SI-NEXT: v_or_b32_e32 v0, v15, v0 -; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 @@ -60936,23 +60786,20 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: s_and_b32 s15, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s10, 16 -; SI-NEXT: s_and_b32 s41, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s9, 16 -; SI-NEXT: s_and_b32 s43, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s7, 16 ; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s11, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s10, s5, 16 -; SI-NEXT: s_and_b32 s13, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s12, s4, 16 +; SI-NEXT: s_and_b32 s11, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_and_b32 s15, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_and_b32 s41, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s5, 16 +; SI-NEXT: s_and_b32 s43, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s42, s4, 16 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v0 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 @@ -60972,28 +60819,22 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v1 ; SI-NEXT: .LBB93_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_mov_b32_e32 v2, s40 -; SI-NEXT: v_mov_b32_e32 v3, s41 -; SI-NEXT: v_mov_b32_e32 v4, s42 -; SI-NEXT: v_mov_b32_e32 v5, s43 -; SI-NEXT: v_mov_b32_e32 v6, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v8, s8 -; SI-NEXT: v_mov_b32_e32 v9, s9 -; SI-NEXT: v_mov_b32_e32 v10, s10 -; SI-NEXT: v_mov_b32_e32 v11, s11 -; SI-NEXT: v_mov_b32_e32 v12, s12 -; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s40 +; SI-NEXT: v_mov_b32_e32 v11, s41 +; SI-NEXT: v_mov_b32_e32 v12, s42 +; SI-NEXT: v_mov_b32_e32 v13, s43 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB93_4: -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 @@ -61002,6 +60843,12 @@ define inreg <32 x bfloat> @bitcast_v32i16_to_v32bf16_scalar(<32 x i16> inreg %a ; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -62682,11 +62529,11 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mul_f32_e64 v62, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v61, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v60, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v7 @@ -62700,16 +62547,16 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v63, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v47, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 ; SI-NEXT: v_mul_f32_e64 v13, 1.0, s29 @@ -62718,46 +62565,56 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v61 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v63 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v50 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v58 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v57 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v56 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_lshr_b64 v[38:39], v[38:39], 16 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v39 ; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 @@ -62799,75 +62656,65 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 -; SI-NEXT: v_lshr_b64 v[38:39], v[38:39], 16 ; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshr_b64 v[18:19], v[38:39], 16 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; SI-NEXT: v_lshr_b64 v[2:3], v[54:55], 16 ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_lshr_b64 v[6:7], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[6:7], v[52:53], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 -; SI-NEXT: v_lshr_b64 v[10:11], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[50:51], 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 +; SI-NEXT: v_lshr_b64 v[39:40], v[1:2], 16 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 -; SI-NEXT: v_lshr_b64 v[14:15], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[14:15], v[48:49], 16 ; SI-NEXT: v_lshr_b64 v[40:41], v[5:6], 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_lshr_b64 v[18:19], v[37:38], 16 ; SI-NEXT: v_lshr_b64 v[41:42], v[9:10], 16 ; SI-NEXT: v_lshr_b64 v[22:23], v[35:36], 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshr_b64 v[42:43], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[26:27], v[33:34], 16 -; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v0 ; SI-NEXT: v_lshr_b64 v[43:44], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[30:31], v[31:32], 16 -; SI-NEXT: v_lshr_b64 v[2:3], v[52:53], 16 ; SI-NEXT: v_lshr_b64 v[44:45], v[21:22], 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_lshr_b64 v[45:46], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[54:55], v[1:2], 16 ; SI-NEXT: v_lshr_b64 v[46:47], v[29:30], 16 ; SI-NEXT: .LBB95_3: ; %end ; SI-NEXT: v_mov_b32_e32 v5, v40 @@ -62897,12 +62744,12 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, v54 -; SI-NEXT: v_mov_b32_e32 v3, v53 -; SI-NEXT: v_mov_b32_e32 v7, v51 -; SI-NEXT: v_mov_b32_e32 v11, v49 -; SI-NEXT: v_mov_b32_e32 v15, v38 -; SI-NEXT: v_mov_b32_e32 v19, v39 +; SI-NEXT: v_mov_b32_e32 v1, v39 +; SI-NEXT: v_mov_b32_e32 v3, v55 +; SI-NEXT: v_mov_b32_e32 v7, v53 +; SI-NEXT: v_mov_b32_e32 v11, v51 +; SI-NEXT: v_mov_b32_e32 v15, v49 +; SI-NEXT: v_mov_b32_e32 v19, v38 ; SI-NEXT: v_mov_b32_e32 v23, v36 ; SI-NEXT: v_mov_b32_e32 v27, v34 ; SI-NEXT: v_mov_b32_e32 v31, v32 @@ -62912,365 +62759,362 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v32bf16_to_v32i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v19, s23 +; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v18, s25 -; VI-NEXT: v_mov_b32_e32 v17, s27 -; VI-NEXT: v_mov_b32_e32 v16, s29 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB95_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB95_3 ; VI-NEXT: .LBB95_2: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v18, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; VI-NEXT: v_bfe_u32 v19, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v2 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v20, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc +; VI-NEXT: v_bfe_u32 v20, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v2 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 -; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v20, v21, vcc +; VI-NEXT: v_bfe_u32 v20, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc +; VI-NEXT: v_bfe_u32 v21, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_bfe_u32 v7, v4, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v24, v9, v11, vcc -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v21, v22, vcc +; VI-NEXT: v_bfe_u32 v21, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v4 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc +; VI-NEXT: v_bfe_u32 v22, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v4 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v20 -; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v4, v22, v23, vcc +; VI-NEXT: v_bfe_u32 v22, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v5 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v22, v22, v23, vcc +; VI-NEXT: v_bfe_u32 v23, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v5 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v24, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_bfe_u32 v9, v6, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cndmask_b32_e32 v5, v23, v24, vcc +; VI-NEXT: v_bfe_u32 v23, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v6 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v24, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc +; VI-NEXT: v_bfe_u32 v24, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v6 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v19 -; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v24, v25, vcc +; VI-NEXT: v_bfe_u32 v24, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v7 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v24, v24, v25, vcc +; VI-NEXT: v_bfe_u32 v25, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v7 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_bfe_u32 v11, v8, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v26, v13, v19, vcc -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v8 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; VI-NEXT: v_cndmask_b32_e32 v7, v25, v26, vcc +; VI-NEXT: v_bfe_u32 v25, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v8 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc +; VI-NEXT: v_bfe_u32 v26, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v8 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v13, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18 -; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v13, v19, vcc -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 -; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_bfe_u32 v18, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v13 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v8, v26, v27, vcc +; VI-NEXT: v_bfe_u32 v26, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v9 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v26, v26, v27, vcc +; VI-NEXT: v_bfe_u32 v27, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v9 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_bfe_u32 v13, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e32 v9, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v10 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 -; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v28, vcc -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v31, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v10 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v11, v28, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 -; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_bfe_u32 v28, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v13 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v13, v28, v29, vcc -; VI-NEXT: v_bfe_u32 v28, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v17 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v28, v28, v29, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v11 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v30, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v11 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_bfe_u32 v17, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v12 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_cndmask_b32_e32 v11, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v12 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v17, v17, v30, vcc -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v32, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v12 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v30, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_bfe_u32 v30, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v17 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_or_b32_e32 v31, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v30, v31, vcc -; VI-NEXT: v_bfe_u32 v30, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v16 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_or_b32_e32 v31, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v30, v31, vcc -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v12, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v13 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v29, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v13 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc -; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v15 -; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_bfe_u32 v32, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v14 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cndmask_b32_e32 v13, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v14 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[24:25] -; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] -; VI-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 -; VI-NEXT: v_mov_b32_e32 v21, v23 -; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[26:27] -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cndmask_b32_e32 v33, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v14 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_mov_b32_e32 v19, v23 -; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[28:29] -; VI-NEXT: v_cndmask_b32_e32 v14, v15, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v32 -; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] -; VI-NEXT: v_mov_b32_e32 v17, v23 -; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[14:15] -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[30:31] -; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13] -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_mov_b32_e32 v15, v23 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v14, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v15 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v27, v28, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[27:28], 16, v[14:15] +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[12:13] +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v32 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v30 +; VI-NEXT: v_lshrrev_b64 v[29:30], 16, v[10:11] +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v31 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v25 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v24 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[6:7] +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[16:17] +; VI-NEXT: v_mov_b32_e32 v1, v18 +; VI-NEXT: v_mov_b32_e32 v3, v20 +; VI-NEXT: v_mov_b32_e32 v5, v22 +; VI-NEXT: v_mov_b32_e32 v7, v24 +; VI-NEXT: v_mov_b32_e32 v9, v30 +; VI-NEXT: v_mov_b32_e32 v11, v29 +; VI-NEXT: v_mov_b32_e32 v13, v28 +; VI-NEXT: v_mov_b32_e32 v15, v27 ; VI-NEXT: .LBB95_3: ; %end -; VI-NEXT: v_mov_b32_e32 v1, v22 -; VI-NEXT: v_mov_b32_e32 v3, v21 -; VI-NEXT: v_mov_b32_e32 v5, v20 -; VI-NEXT: v_mov_b32_e32 v7, v19 -; VI-NEXT: v_mov_b32_e32 v9, v18 -; VI-NEXT: v_mov_b32_e32 v11, v17 -; VI-NEXT: v_mov_b32_e32 v13, v16 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_4: ; VI-NEXT: s_branch .LBB95_2 @@ -64249,9 +64093,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -64356,16 +64200,16 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr62 @@ -64380,7 +64224,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB96_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v56, v1, v63 +; SI-NEXT: v_or_b32_e32 v56, v1, v35 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v47, v1, v27 ; SI-NEXT: v_alignbit_b32 v1, v47, v56, 24 @@ -64454,7 +64298,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v14, v1, v41 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24 +; SI-NEXT: v_alignbit_b32 v59, v18, v22, 24 ; SI-NEXT: v_alignbit_b32 v61, v18, v22, 16 ; SI-NEXT: v_bfe_u32 v62, v44, 8, 8 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -64469,7 +64313,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 @@ -64490,34 +64334,31 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v6, v1, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 ; SI-NEXT: v_or_b32_e32 v2, v1, v42 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -64535,13 +64376,13 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v12, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v16, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v24, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill @@ -64549,27 +64390,30 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v1, v28, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 -; SI-NEXT: v_alignbit_b32 v57, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v46, v10, v14, 24 ; SI-NEXT: v_alignbit_b32 v58, v10, v14, 16 ; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24 -; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8 +; SI-NEXT: v_alignbit_b32 v57, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v60, v2, v6, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 @@ -64596,7 +64440,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v35, v1 ; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -64611,13 +64455,13 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v43, v2 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v42, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_alignbit_b32 v45, v2, v6, 24 -; SI-NEXT: v_alignbit_b32 v46, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v59, v2, v6, 8 +; SI-NEXT: v_alignbit_b32 v57, v2, v6, 16 +; SI-NEXT: v_alignbit_b32 v60, v2, v6, 8 ; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 @@ -64693,43 +64537,43 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v38 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v18 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill @@ -64738,7 +64582,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -64747,7 +64591,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_alignbit_b32 v4, v47, v56, 24 @@ -64756,10 +64600,10 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v12, v26, v30, 24 ; SI-NEXT: v_alignbit_b32 v16, v26, v30, 16 ; SI-NEXT: v_alignbit_b32 v44, v26, v30, 8 -; SI-NEXT: v_alignbit_b32 v60, v18, v22, 24 +; SI-NEXT: v_alignbit_b32 v59, v18, v22, 24 ; SI-NEXT: v_alignbit_b32 v61, v18, v22, 16 ; SI-NEXT: v_alignbit_b32 v20, v18, v22, 8 -; SI-NEXT: v_alignbit_b32 v57, v10, v14, 24 +; SI-NEXT: v_alignbit_b32 v46, v10, v14, 24 ; SI-NEXT: v_alignbit_b32 v58, v10, v14, 16 ; SI-NEXT: v_alignbit_b32 v8, v10, v14, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -64784,7 +64628,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 @@ -64820,14 +64664,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -64856,14 +64700,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -64888,17 +64732,17 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -64918,14 +64762,14 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -64942,13 +64786,13 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v61 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v60 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v59 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -64972,13 +64816,13 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v57 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v46 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload @@ -64998,9 +64842,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v60 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v57 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -65045,28 +64889,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; ; VI-LABEL: bitcast_v32i16_to_v64i8: ; VI: ; %bb.0: -; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; kill: killed $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; kill: killed $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; kill: killed $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; kill: killed $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -65083,140 +64925,144 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; kill: killed $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; kill: killed $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; kill: killed $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; kill: killed $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; kill: killed $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr18 -; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; kill: killed $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr21 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB96_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v15 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v14 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v12 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[13:14] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v10 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[11:12] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v9 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v8 -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v7 -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v6 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v6 -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v19, 24, v14 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] +; VI-NEXT: v_lshrrev_b32_e32 v18, 24, v4 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 -; VI-NEXT: v_mov_b32_e32 v25, v50 -; VI-NEXT: v_mov_b32_e32 v41, v1 -; VI-NEXT: v_mov_b32_e32 v54, v2 -; VI-NEXT: v_mov_b32_e32 v57, v3 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v61, v5 -; VI-NEXT: v_mov_b32_e32 v60, v6 -; VI-NEXT: v_mov_b32_e32 v52, v7 -; VI-NEXT: v_mov_b32_e32 v63, v8 -; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v1 +; VI-NEXT: v_mov_b32_e32 v27, v19 +; VI-NEXT: v_mov_b32_e32 v43, v1 +; VI-NEXT: v_mov_b32_e32 v42, v2 +; VI-NEXT: v_mov_b32_e32 v47, v3 +; VI-NEXT: v_mov_b32_e32 v45, v4 +; VI-NEXT: v_mov_b32_e32 v59, v5 +; VI-NEXT: v_mov_b32_e32 v58, v6 +; VI-NEXT: v_mov_b32_e32 v63, v7 +; VI-NEXT: v_mov_b32_e32 v61, v8 +; VI-NEXT: v_mov_b32_e32 v54, v9 ; VI-NEXT: v_mov_b32_e32 v53, v10 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v44, v12 -; VI-NEXT: v_mov_b32_e32 v58, v13 -; VI-NEXT: v_mov_b32_e32 v56, v14 -; VI-NEXT: v_mov_b32_e32 v50, v15 +; VI-NEXT: v_mov_b32_e32 v44, v11 +; VI-NEXT: v_mov_b32_e32 v17, v12 +; VI-NEXT: v_mov_b32_e32 v19, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; VI-NEXT: v_mov_b32_e32 v57, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v15 +; VI-NEXT: v_mov_b32_e32 v52, v15 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v16 ; VI-NEXT: v_mov_b32_e32 v62, v16 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr3 @@ -65231,267 +65077,268 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_cbranch_execz .LBB96_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v18, 3 -; VI-NEXT: v_add_u16_sdwa v26, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v29, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v62, 3, v16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; VI-NEXT: v_add_u16_e32 v50, 3, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; VI-NEXT: v_or_b32_e32 v16, v62, v16 -; VI-NEXT: v_or_b32_e32 v15, v50, v15 -; VI-NEXT: v_add_u16_sdwa v38, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v49, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v36, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v48, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v34, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v39, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v32, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v37, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v30, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v35, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v28, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v33, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v27, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_sdwa v31, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[15:16] -; VI-NEXT: v_add_u16_e32 v56, 3, v14 -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27 -; VI-NEXT: v_add_u16_e32 v58, 3, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 -; VI-NEXT: v_or_b32_e32 v14, v56, v14 -; VI-NEXT: v_or_b32_e32 v13, v58, v13 -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] -; VI-NEXT: v_add_u16_e32 v44, 3, v12 -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v28 -; VI-NEXT: v_add_u16_e32 v17, 3, v11 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 +; VI-NEXT: v_add_u16_sdwa v29, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v33, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v57, 3, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v29 +; VI-NEXT: v_add_u16_e32 v19, 3, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; VI-NEXT: v_or_b32_e32 v14, v57, v14 +; VI-NEXT: v_or_b32_e32 v13, v19, v13 +; VI-NEXT: v_add_u16_sdwa v30, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v35, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[13:14] +; VI-NEXT: v_add_u16_e32 v17, 3, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v30 +; VI-NEXT: v_add_u16_e32 v44, 3, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; VI-NEXT: v_or_b32_e32 v12, v17, v12 +; VI-NEXT: v_or_b32_e32 v11, v44, v11 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_sdwa v32, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v37, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[11:12] ; VI-NEXT: v_add_u16_e32 v53, 3, v10 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 -; VI-NEXT: v_add_u16_e32 v40, 3, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v35 -; VI-NEXT: v_or_b32_e32 v12, v44, v12 -; VI-NEXT: v_or_b32_e32 v11, v17, v11 -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_add_u16_e32 v63, 3, v8 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; VI-NEXT: v_add_u16_e32 v52, 3, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; VI-NEXT: v_add_u16_e32 v54, 3, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 ; VI-NEXT: v_or_b32_e32 v10, v53, v10 -; VI-NEXT: v_or_b32_e32 v9, v40, v9 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_add_u16_e32 v60, 3, v6 -; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v34 -; VI-NEXT: v_add_u16_e32 v61, 3, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 -; VI-NEXT: v_or_b32_e32 v8, v63, v8 -; VI-NEXT: v_or_b32_e32 v7, v52, v7 +; VI-NEXT: v_or_b32_e32 v9, v54, v9 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_sdwa v48, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v51, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v34, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v39, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_add_u16_e32 v47, 3, v4 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; VI-NEXT: v_add_u16_e32 v57, 3, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; VI-NEXT: v_or_b32_e32 v6, v60, v6 -; VI-NEXT: v_or_b32_e32 v5, v61, v5 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_add_u16_e32 v54, 3, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; VI-NEXT: v_add_u16_e32 v41, 3, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 -; VI-NEXT: v_or_b32_e32 v4, v47, v4 -; VI-NEXT: v_or_b32_e32 v3, v57, v3 -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] -; VI-NEXT: v_or_b32_e32 v2, v54, v2 -; VI-NEXT: v_or_b32_e32 v1, v41, v1 -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; VI-NEXT: v_bfe_u32 v1, v27, 8, 8 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v1, v28, 8, 8 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v42, 3, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; VI-NEXT: v_add_u16_e32 v43, 3, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 +; VI-NEXT: v_add_u16_sdwa v36, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v49, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v61, 3, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v34 +; VI-NEXT: v_add_u16_e32 v63, 3, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; VI-NEXT: v_or_b32_e32 v2, v42, v2 +; VI-NEXT: v_or_b32_e32 v1, v43, v1 +; VI-NEXT: v_add_u16_sdwa v38, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v50, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v58, 3, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v36 +; VI-NEXT: v_add_u16_e32 v59, 3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; VI-NEXT: v_or_b32_e32 v8, v61, v8 +; VI-NEXT: v_or_b32_e32 v7, v63, v7 +; VI-NEXT: v_add_u16_sdwa v28, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v31, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v45, 3, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; VI-NEXT: v_add_u16_e32 v47, 3, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; VI-NEXT: v_or_b32_e32 v6, v58, v6 +; VI-NEXT: v_or_b32_e32 v5, v59, v5 +; VI-NEXT: v_add_u16_e32 v62, 3, v16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v28 +; VI-NEXT: v_add_u16_e32 v52, 3, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v1 ; VI-NEXT: v_bfe_u32 v1, v30, 8, 8 -; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2 -; VI-NEXT: v_bfe_u32 v25, v26, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v4, v45, v4 +; VI-NEXT: v_or_b32_e32 v3, v47, v3 +; VI-NEXT: v_or_b32_e32 v16, v62, v16 +; VI-NEXT: v_or_b32_e32 v15, v52, v15 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v32, 8, 8 -; VI-NEXT: v_bfe_u32 v43, v34, 8, 8 -; VI-NEXT: v_bfe_u32 v46, v36, 8, 8 -; VI-NEXT: v_bfe_u32 v59, v38, 8, 8 -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v34, 8, 8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v2 +; VI-NEXT: v_bfe_u32 v23, v28, 8, 8 +; VI-NEXT: v_bfe_u32 v27, v29, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v1, v36, 8, 8 +; VI-NEXT: v_bfe_u32 v18, v38, 8, 8 +; VI-NEXT: v_bfe_u32 v60, v48, 8, 8 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: .LBB96_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v18 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 -; VI-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v60 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46 -; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 -; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v21 +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 -; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v21 -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25 -; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23 ; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -65516,9 +65363,7 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -65535,108 +65380,107 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB96_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v16 ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v26, v23 -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB96_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB96_4 @@ -65655,158 +65499,155 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB96_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v27, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v26, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v42 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v26 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -66448,436 +66289,385 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v20, s98, 34 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; SI-NEXT: v_writelane_b32 v20, s99, 35 -; SI-NEXT: s_mov_b32 s93, s18 -; SI-NEXT: s_mov_b32 s31, s17 -; SI-NEXT: v_readfirstlane_b32 s59, v18 -; SI-NEXT: v_readfirstlane_b32 s18, v17 -; SI-NEXT: v_readfirstlane_b32 s63, v16 -; SI-NEXT: v_readfirstlane_b32 s17, v15 -; SI-NEXT: v_readfirstlane_b32 s72, v14 -; SI-NEXT: v_readfirstlane_b32 s76, v13 -; SI-NEXT: v_readfirstlane_b32 s57, v12 -; SI-NEXT: v_readfirstlane_b32 s61, v11 -; SI-NEXT: v_readfirstlane_b32 s44, v10 -; SI-NEXT: v_readfirstlane_b32 s58, v9 -; SI-NEXT: v_readfirstlane_b32 s62, v8 -; SI-NEXT: v_readfirstlane_b32 s45, v7 -; SI-NEXT: v_readfirstlane_b32 s96, v6 -; SI-NEXT: v_readfirstlane_b32 s97, v5 -; SI-NEXT: v_readfirstlane_b32 s99, v4 -; SI-NEXT: v_readfirstlane_b32 s46, v3 -; SI-NEXT: v_readfirstlane_b32 s83, v2 +; SI-NEXT: v_readfirstlane_b32 s97, v18 +; SI-NEXT: v_readfirstlane_b32 s99, v17 +; SI-NEXT: v_readfirstlane_b32 s44, v16 +; SI-NEXT: v_readfirstlane_b32 s46, v15 +; SI-NEXT: v_readfirstlane_b32 s85, v14 +; SI-NEXT: v_readfirstlane_b32 s87, v13 +; SI-NEXT: v_readfirstlane_b32 s96, v12 +; SI-NEXT: v_readfirstlane_b32 s98, v11 +; SI-NEXT: v_readfirstlane_b32 s81, v10 +; SI-NEXT: v_readfirstlane_b32 s83, v9 +; SI-NEXT: v_readfirstlane_b32 s84, v8 +; SI-NEXT: v_readfirstlane_b32 s86, v7 +; SI-NEXT: v_readfirstlane_b32 s70, v6 +; SI-NEXT: v_readfirstlane_b32 s71, v5 +; SI-NEXT: v_readfirstlane_b32 s80, v4 +; SI-NEXT: v_readfirstlane_b32 s82, v3 +; SI-NEXT: v_readfirstlane_b32 s68, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s85, v1 +; SI-NEXT: v_readfirstlane_b32 s69, v1 ; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s31, 16 -; SI-NEXT: s_or_b32 s40, s4, s5 -; SI-NEXT: s_and_b32 s4, s93, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff ; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: s_or_b32 s41, s4, s5 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v21, s4, 4 -; SI-NEXT: v_writelane_b32 v21, s5, 5 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 ; SI-NEXT: v_writelane_b32 v21, s4, 2 ; SI-NEXT: v_writelane_b32 v21, s5, 3 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 ; SI-NEXT: v_writelane_b32 v21, s4, 0 ; SI-NEXT: v_writelane_b32 v21, s5, 1 ; SI-NEXT: s_and_b32 s4, s20, 0xffff ; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_or_b32 s40, s4, s5 ; SI-NEXT: s_and_b32 s4, s22, 0xffff ; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: s_or_b32 s15, s4, s5 -; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 24 -; SI-NEXT: v_writelane_b32 v21, s4, 10 -; SI-NEXT: v_writelane_b32 v21, s5, 11 -; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v21, s4, 8 -; SI-NEXT: v_writelane_b32 v21, s5, 9 -; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 8 -; SI-NEXT: v_writelane_b32 v21, s4, 6 -; SI-NEXT: v_writelane_b32 v21, s5, 7 +; SI-NEXT: s_or_b32 s41, s4, s5 ; SI-NEXT: s_and_b32 s4, s24, 0xffff ; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_or_b32 s14, s4, s5 ; SI-NEXT: s_and_b32 s4, s26, 0xffff ; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: s_or_b32 s11, s4, s5 -; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v21, s4, 16 -; SI-NEXT: v_writelane_b32 v21, s5, 17 -; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 16 -; SI-NEXT: v_writelane_b32 v21, s4, 14 -; SI-NEXT: v_writelane_b32 v21, s5, 15 -; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 8 -; SI-NEXT: v_writelane_b32 v21, s4, 12 -; SI-NEXT: v_writelane_b32 v21, s5, 13 +; SI-NEXT: s_or_b32 s15, s4, s5 ; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: s_or_b32 s42, s4, s5 -; SI-NEXT: s_and_b32 s4, s85, 0xffff -; SI-NEXT: s_lshl_b32 s5, s83, 16 -; SI-NEXT: s_or_b32 s43, s4, s5 -; SI-NEXT: s_and_b32 s4, s46, 0xffff -; SI-NEXT: s_lshl_b32 s5, s99, 16 ; SI-NEXT: s_or_b32 s12, s4, s5 -; SI-NEXT: s_and_b32 s4, s97, 0xffff -; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_and_b32 s4, s69, 0xffff +; SI-NEXT: s_lshl_b32 s5, s68, 16 ; SI-NEXT: s_or_b32 s13, s4, s5 -; SI-NEXT: s_and_b32 s4, s45, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 +; SI-NEXT: s_and_b32 s4, s82, 0xffff +; SI-NEXT: s_lshl_b32 s5, s80, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s71, 0xffff +; SI-NEXT: s_lshl_b32 s5, s70, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s86, 0xffff +; SI-NEXT: s_lshl_b32 s5, s84, 16 ; SI-NEXT: s_or_b32 s8, s4, s5 -; SI-NEXT: s_and_b32 s4, s58, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: s_and_b32 s4, s83, 0xffff +; SI-NEXT: s_lshl_b32 s5, s81, 16 ; SI-NEXT: s_or_b32 s9, s4, s5 -; SI-NEXT: s_and_b32 s4, s61, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 +; SI-NEXT: s_and_b32 s4, s98, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 ; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_and_b32 s4, s76, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_and_b32 s4, s87, 0xffff +; SI-NEXT: s_lshl_b32 s5, s85, 16 ; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s63, 16 -; SI-NEXT: s_and_b32 s78, s72, 0xffff -; SI-NEXT: s_lshr_b64 s[34:35], s[8:9], 24 +; SI-NEXT: s_and_b32 s4, s46, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s47, s59, 16 -; SI-NEXT: s_mov_b32 s35, s78 -; SI-NEXT: s_mov_b32 s78, s93 -; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 24 -; SI-NEXT: s_or_b32 s5, s5, s47 -; SI-NEXT: s_lshr_b32 s79, s7, 8 -; SI-NEXT: s_mov_b32 s93, s78 -; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 16 -; SI-NEXT: s_mov_b32 s78, s31 -; SI-NEXT: s_lshr_b64 s[30:31], s[6:7], 8 -; SI-NEXT: s_lshr_b32 s88, s5, 8 -; SI-NEXT: s_bfe_u32 s89, s72, 0x80008 +; SI-NEXT: s_and_b32 s5, s99, 0xffff +; SI-NEXT: s_lshl_b32 s45, s97, 16 +; SI-NEXT: s_or_b32 s5, s5, s45 +; SI-NEXT: s_lshr_b64 s[62:63], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 24 ; SI-NEXT: s_lshr_b64 s[36:37], s[8:9], 16 -; SI-NEXT: s_mov_b32 s95, s79 -; SI-NEXT: s_mov_b32 s31, s78 -; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 -; SI-NEXT: s_and_b32 s90, s59, 0xffff -; SI-NEXT: s_mov_b32 s37, s89 -; SI-NEXT: s_mov_b32 s79, s88 -; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 -; SI-NEXT: s_bfe_u32 vcc_lo, s59, 0x80008 -; SI-NEXT: s_mov_b32 s89, s90 -; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 8 -; SI-NEXT: s_lshr_b32 s60, s41, 8 -; SI-NEXT: s_lshr_b32 s87, s15, 8 -; SI-NEXT: s_lshr_b32 s82, s11, 8 -; SI-NEXT: s_lshr_b32 s71, s43, 8 -; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[64:65], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s55, s43, 8 +; SI-NEXT: s_lshr_b32 s49, s41, 8 +; SI-NEXT: s_lshr_b32 s35, s15, 8 +; SI-NEXT: s_lshr_b32 s93, s13, 8 +; SI-NEXT: s_lshr_b32 s79, s11, 8 ; SI-NEXT: s_lshr_b32 s73, s9, 8 -; SI-NEXT: s_and_b32 s74, s19, 0xffff -; SI-NEXT: s_and_b32 s98, s23, 0xffff -; SI-NEXT: s_and_b32 s84, s27, 0xffff -; SI-NEXT: s_and_b32 s80, s83, 0xffff -; SI-NEXT: s_and_b32 s69, s96, 0xffff -; SI-NEXT: s_and_b32 s75, s44, 0xffff -; SI-NEXT: s_bfe_u32 s47, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s56, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s86, s27, 0x80008 -; SI-NEXT: s_bfe_u32 s81, s83, 0x80008 -; SI-NEXT: s_bfe_u32 s70, s96, 0x80008 -; SI-NEXT: s_bfe_u32 s77, s44, 0x80008 -; SI-NEXT: s_lshr_b64 s[54:55], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[48:49], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[38:39], s[8:9], 8 -; SI-NEXT: s_mov_b32 s91, vcc_lo +; SI-NEXT: s_lshr_b32 s59, s7, 8 +; SI-NEXT: s_lshr_b32 s45, s5, 8 +; SI-NEXT: s_and_b32 s65, s19, 0xffff +; SI-NEXT: s_and_b32 s51, s23, 0xffff +; SI-NEXT: s_and_b32 s37, s27, 0xffff +; SI-NEXT: s_and_b32 s95, s68, 0xffff +; SI-NEXT: s_and_b32 s89, s70, 0xffff +; SI-NEXT: s_and_b32 s75, s81, 0xffff +; SI-NEXT: s_and_b32 s61, s85, 0xffff +; SI-NEXT: s_and_b32 s47, s97, 0xffff +; SI-NEXT: s_bfe_u32 s67, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s53, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s39, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s31, s68, 0x80008 +; SI-NEXT: s_bfe_u32 s91, s70, 0x80008 +; SI-NEXT: s_bfe_u32 s77, s81, 0x80008 +; SI-NEXT: s_bfe_u32 s63, s85, 0x80008 +; SI-NEXT: s_bfe_u32 s57, s97, 0x80008 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s63, 16 -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_and_b32 s4, s46, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: s_add_i32 s99, s99, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s59, 16 -; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_and_b32 s5, s99, 0xffff +; SI-NEXT: s_lshl_b32 s6, s97, 16 +; SI-NEXT: s_add_i32 s98, s98, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s6, s61, 0xffff -; SI-NEXT: s_lshl_b32 s7, s57, 16 -; SI-NEXT: s_add_i32 s76, s76, 3 +; SI-NEXT: s_and_b32 s6, s98, 0xffff +; SI-NEXT: s_lshl_b32 s7, s96, 16 +; SI-NEXT: s_add_i32 s87, s87, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s7, s76, 0xffff -; SI-NEXT: s_lshl_b32 s8, s72, 16 -; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_and_b32 s7, s87, 0xffff +; SI-NEXT: s_lshl_b32 s8, s85, 16 +; SI-NEXT: s_add_i32 s86, s86, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: s_and_b32 s8, s45, 0xffff -; SI-NEXT: s_lshl_b32 s9, s62, 16 -; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_and_b32 s8, s86, 0xffff +; SI-NEXT: s_lshl_b32 s9, s84, 16 +; SI-NEXT: s_add_i32 s83, s83, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: s_and_b32 s9, s58, 0xffff -; SI-NEXT: s_lshl_b32 s10, s44, 16 -; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_and_b32 s9, s83, 0xffff +; SI-NEXT: s_lshl_b32 s10, s81, 16 +; SI-NEXT: s_add_i32 s82, s82, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: s_and_b32 s10, s46, 0xffff -; SI-NEXT: s_lshl_b32 s11, s99, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s97, s97, 3 -; SI-NEXT: s_add_i32 s12, s10, 0x30000 -; SI-NEXT: s_and_b32 s10, s97, 0xffff -; SI-NEXT: s_lshl_b32 s11, s96, 16 +; SI-NEXT: s_and_b32 s10, s82, 0xffff +; SI-NEXT: s_lshl_b32 s11, s80, 16 +; SI-NEXT: s_add_i32 s71, s71, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s71, 0xffff +; SI-NEXT: s_lshl_b32 s12, s70, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s13, s10, 0x30000 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s85, s85, 3 -; SI-NEXT: s_add_i32 s42, s10, 0x30000 -; SI-NEXT: s_and_b32 s10, s85, 0xffff -; SI-NEXT: s_lshl_b32 s11, s83, 16 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s43, s10, 0x30000 -; SI-NEXT: s_and_b32 s10, s24, 0xffff -; SI-NEXT: s_lshl_b32 s11, s25, 16 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: s_and_b32 s11, s26, 0xffff -; SI-NEXT: s_lshl_b32 s14, s27, 16 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_or_b32 s11, s14, s11 -; SI-NEXT: s_and_b32 s14, s20, 0xffff -; SI-NEXT: s_lshl_b32 s15, s21, 16 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: s_and_b32 s15, s22, 0xffff -; SI-NEXT: s_lshl_b32 s17, s23, 16 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_or_b32 s15, s17, s15 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s17, s31, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s28, 0xffff +; SI-NEXT: s_lshl_b32 s13, s29, 16 +; SI-NEXT: s_add_i32 s69, s69, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s40, s16, 0x30000 -; SI-NEXT: s_add_i32 s16, s93, 3 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s69, 0xffff +; SI-NEXT: s_lshl_b32 s14, s68, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_add_i32 s42, s16, 0x30000 +; SI-NEXT: s_and_b32 s16, s18, 0xffff ; SI-NEXT: s_lshl_b32 s17, s19, 16 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_and_b32 s14, s24, 0xffff +; SI-NEXT: s_lshl_b32 s15, s25, 16 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s41, s16, 0x30000 -; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 24 -; SI-NEXT: v_writelane_b32 v21, s16, 4 -; SI-NEXT: v_writelane_b32 v21, s17, 5 -; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 16 -; SI-NEXT: v_writelane_b32 v21, s16, 2 -; SI-NEXT: v_writelane_b32 v21, s17, 3 -; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 8 -; SI-NEXT: s_add_i32 s14, s14, 0x30000 -; SI-NEXT: s_add_i32 s15, s15, 0x30000 -; SI-NEXT: v_writelane_b32 v21, s16, 0 -; SI-NEXT: v_writelane_b32 v21, s17, 1 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 24 -; SI-NEXT: v_writelane_b32 v21, s16, 10 -; SI-NEXT: v_writelane_b32 v21, s17, 11 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 16 -; SI-NEXT: v_writelane_b32 v21, s16, 8 -; SI-NEXT: v_writelane_b32 v21, s17, 9 -; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 -; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: s_add_i32 s11, s11, 0x30000 -; SI-NEXT: v_writelane_b32 v21, s16, 6 -; SI-NEXT: v_writelane_b32 v21, s17, 7 -; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 24 -; SI-NEXT: v_writelane_b32 v21, s16, 16 -; SI-NEXT: v_writelane_b32 v21, s17, 17 -; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 16 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s26, 0xffff +; SI-NEXT: s_lshl_b32 s24, s27, 16 +; SI-NEXT: s_add_i32 s40, s20, 0x30000 +; SI-NEXT: s_and_b32 s20, s22, 0xffff +; SI-NEXT: s_lshl_b32 s21, s23, 16 +; SI-NEXT: s_add_i32 s43, s16, 0x30000 +; SI-NEXT: s_or_b32 s15, s24, s15 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 24 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 ; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 -; SI-NEXT: v_writelane_b32 v21, s16, 14 -; SI-NEXT: v_writelane_b32 v21, s17, 15 -; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 8 -; SI-NEXT: s_lshr_b64 s[34:35], s[8:9], 24 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: s_add_i32 s41, s20, 0x30000 +; SI-NEXT: v_writelane_b32 v21, s16, 2 +; SI-NEXT: v_writelane_b32 v21, s17, 3 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[8:9], 24 ; SI-NEXT: s_lshr_b64 s[36:37], s[8:9], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 -; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 8 -; SI-NEXT: v_writelane_b32 v21, s16, 12 -; SI-NEXT: s_lshr_b64 s[54:55], s[42:43], 24 -; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 8 -; SI-NEXT: s_lshr_b64 s[48:49], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[38:39], s[8:9], 8 -; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[30:31], s[6:7], 8 -; SI-NEXT: s_lshr_b32 s47, s41, 24 -; SI-NEXT: s_lshr_b32 s74, s41, 16 -; SI-NEXT: s_lshr_b32 s60, s41, 8 -; SI-NEXT: s_lshr_b32 s56, s15, 24 -; SI-NEXT: s_lshr_b32 s98, s15, 16 -; SI-NEXT: s_lshr_b32 s87, s15, 8 -; SI-NEXT: s_lshr_b32 s86, s11, 24 -; SI-NEXT: s_lshr_b32 s84, s11, 16 -; SI-NEXT: s_lshr_b32 s82, s11, 8 -; SI-NEXT: s_lshr_b32 s81, s43, 24 -; SI-NEXT: s_lshr_b32 s80, s43, 16 -; SI-NEXT: s_lshr_b32 s71, s43, 8 -; SI-NEXT: s_lshr_b32 s70, s13, 24 -; SI-NEXT: s_lshr_b32 s69, s13, 16 -; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[64:65], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v21, s16, 0 +; SI-NEXT: s_lshr_b32 s67, s43, 24 +; SI-NEXT: s_lshr_b32 s65, s43, 16 +; SI-NEXT: s_lshr_b32 s55, s43, 8 +; SI-NEXT: s_lshr_b32 s53, s41, 24 +; SI-NEXT: s_lshr_b32 s51, s41, 16 +; SI-NEXT: s_lshr_b32 s49, s41, 8 +; SI-NEXT: s_lshr_b32 s39, s15, 24 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s35, s15, 8 +; SI-NEXT: s_lshr_b32 s31, s13, 24 +; SI-NEXT: s_lshr_b32 s95, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s13, 8 +; SI-NEXT: s_lshr_b32 s91, s11, 24 +; SI-NEXT: s_lshr_b32 s89, s11, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 8 ; SI-NEXT: s_lshr_b32 s77, s9, 24 ; SI-NEXT: s_lshr_b32 s75, s9, 16 ; SI-NEXT: s_lshr_b32 s73, s9, 8 -; SI-NEXT: s_lshr_b32 s37, s7, 24 -; SI-NEXT: s_lshr_b32 s35, s7, 16 -; SI-NEXT: s_lshr_b32 s95, s7, 8 -; SI-NEXT: s_lshr_b32 s91, s5, 24 -; SI-NEXT: s_lshr_b32 s89, s5, 16 -; SI-NEXT: s_lshr_b32 s79, s5, 8 -; SI-NEXT: v_writelane_b32 v21, s17, 13 +; SI-NEXT: s_lshr_b32 s63, s7, 24 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s59, s7, 8 +; SI-NEXT: s_lshr_b32 s57, s5, 24 +; SI-NEXT: s_lshr_b32 s47, s5, 16 +; SI-NEXT: s_lshr_b32 s45, s5, 8 +; SI-NEXT: v_writelane_b32 v21, s17, 1 ; SI-NEXT: .LBB97_3: ; %end +; SI-NEXT: s_and_b32 s16, s42, 0xff +; SI-NEXT: s_lshl_b32 s17, s62, 8 ; SI-NEXT: v_readlane_b32 s18, v21, 0 -; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v21, 2 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 4 +; SI-NEXT: v_readlane_b32 s18, v21, 2 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xff +; SI-NEXT: s_lshl_b32 s17, s55, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s65, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s67, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: s_lshl_b32 s17, s76, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s60, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s56, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s41, 0xff -; SI-NEXT: s_lshl_b32 s17, s60, 8 +; SI-NEXT: s_lshl_b32 s17, s49, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_and_b32 s17, s74, 0xff +; SI-NEXT: s_and_b32 s17, s51, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s47, 24 +; SI-NEXT: s_lshl_b32 s18, s53, 24 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 -; SI-NEXT: v_readlane_b32 s16, v21, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s17, v21, 7 +; SI-NEXT: s_lshl_b32 s16, s90, 8 ; SI-NEXT: s_or_b32 s14, s14, s16 -; SI-NEXT: v_readlane_b32 s16, v21, 8 -; SI-NEXT: v_readlane_b32 s17, v21, 9 -; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s18, v21, 10 +; SI-NEXT: s_and_b32 s16, s74, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s17, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s58, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xff -; SI-NEXT: s_lshl_b32 s15, s87, 8 +; SI-NEXT: s_lshl_b32 s15, s35, 8 ; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: s_and_b32 s15, s98, 0xff +; SI-NEXT: s_and_b32 s15, s37, 0xff ; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: s_lshl_b32 s16, s56, 24 +; SI-NEXT: s_lshl_b32 s16, s39, 24 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: v_readlane_b32 s14, v21, 12 -; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s14, s14, 8 -; SI-NEXT: v_readlane_b32 s15, v21, 13 -; SI-NEXT: s_or_b32 s10, s10, s14 -; SI-NEXT: v_readlane_b32 s14, v21, 14 -; SI-NEXT: v_readlane_b32 s15, v21, 15 -; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s16, v21, 16 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s14, s30, 8 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s14, s88, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s15, s16, 24 -; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s15, s72, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s10, s10, s14 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s11, 0xff -; SI-NEXT: s_lshl_b32 s11, s82, 8 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s84, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s14, s86, 24 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_or_b32 s11, s14, s11 -; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s42, 0xff -; SI-NEXT: s_lshl_b32 s11, s66, 8 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s64, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s14, s54, 24 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_or_b32 s11, s14, s11 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s43, 0xff -; SI-NEXT: s_lshl_b32 s11, s71, 8 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s80, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s14, s81, 24 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_or_b32 s11, s14, s11 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xff +; SI-NEXT: s_lshl_b32 s13, s93, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s95, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s31, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s14, s13 ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s12, 0xff -; SI-NEXT: s_lshl_b32 s11, s52, 8 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s50, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s12, s48, 24 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s12, s38, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s12, s94, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s78, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_and_b32 s10, s13, 0xff -; SI-NEXT: s_lshl_b32 s11, s68, 8 +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: s_lshl_b32 s11, s79, 8 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: s_and_b32 s11, s69, 0xff +; SI-NEXT: s_and_b32 s11, s89, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s12, s70, 24 +; SI-NEXT: s_lshl_b32 s12, s91, 24 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s12, s11 ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 @@ -66886,11 +66676,11 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s38, 8 +; SI-NEXT: s_lshl_b32 s10, s52, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: s_and_b32 s10, s36, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s11, s34, 24 +; SI-NEXT: s_lshl_b32 s11, s92, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 @@ -66912,11 +66702,11 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s30, 8 +; SI-NEXT: s_lshl_b32 s8, s64, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s94, 0xff +; SI-NEXT: s_and_b32 s8, s50, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s92, 24 +; SI-NEXT: s_lshl_b32 s9, s34, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 @@ -66925,11 +66715,11 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: s_lshl_b32 s7, s95, 8 +; SI-NEXT: s_lshl_b32 s7, s59, 8 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s35, 0xff +; SI-NEXT: s_and_b32 s7, s61, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s8, s37, 24 +; SI-NEXT: s_lshl_b32 s8, s63, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 @@ -66938,11 +66728,11 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s6, s90, 8 +; SI-NEXT: s_lshl_b32 s6, s66, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s88, 0xff +; SI-NEXT: s_and_b32 s6, s54, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s78, 24 +; SI-NEXT: s_lshl_b32 s7, s48, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 @@ -66951,23 +66741,20 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: s_lshl_b32 s5, s79, 8 +; SI-NEXT: s_lshl_b32 s5, s45, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s89, 0xff +; SI-NEXT: s_and_b32 s5, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s91, 24 -; SI-NEXT: v_readlane_b32 s19, v21, 1 +; SI-NEXT: s_lshl_b32 s6, s57, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s19, v21, 3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s19, v21, 5 +; SI-NEXT: v_readlane_b32 s19, v21, 1 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s19, v21, 11 -; SI-NEXT: v_readlane_b32 s17, v21, 17 +; SI-NEXT: v_readlane_b32 s19, v21, 3 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v20, 35 ; SI-NEXT: v_readlane_b32 s98, v20, 34 @@ -67018,75 +66805,61 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v21, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v21, s4, 2 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: v_writelane_b32 v21, s5, 3 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 4 -; SI-NEXT: v_writelane_b32 v21, s5, 5 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 6 -; SI-NEXT: v_writelane_b32 v21, s5, 7 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 8 -; SI-NEXT: v_writelane_b32 v21, s5, 9 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 10 -; SI-NEXT: v_writelane_b32 v21, s5, 11 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 12 -; SI-NEXT: v_writelane_b32 v21, s5, 13 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 14 -; SI-NEXT: v_writelane_b32 v21, s5, 15 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 16 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr67 ; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr87 -; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr81 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: v_writelane_b32 v21, s5, 17 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v32i16_to_v64i8_scalar: @@ -67599,7 +67372,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v63, s30, 0 ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 @@ -67716,38 +67489,36 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; GFX9-NEXT: s_cbranch_execnz .LBB97_4 ; GFX9-NEXT: .LBB97_2: ; %cmp.true -; GFX9-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s5, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX9-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, s6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_pk_add_u16 v6, s9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, s8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] ; GFX9-NEXT: v_pk_add_u16 v8, s11, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, s10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] ; GFX9-NEXT: v_pk_add_u16 v10, s13, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v9, s12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] ; GFX9-NEXT: v_pk_add_u16 v12, s15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, s14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, s7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, s6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; GFX9-NEXT: v_pk_add_u16 v16, s17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, s16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] -; GFX9-NEXT: v_pk_add_u16 v20, s19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, s18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; GFX9-NEXT: v_pk_add_u16 v14, s17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, s16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[11:12] +; GFX9-NEXT: v_pk_add_u16 v16, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, s18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v2 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 @@ -67776,16 +67547,16 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v15 ; GFX9-NEXT: s_branch .LBB97_5 ; GFX9-NEXT: .LBB97_3: ; GFX9-NEXT: ; implicit-def: $sgpr55 @@ -67838,15 +67609,16 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: s_branch .LBB97_2 ; GFX9-NEXT: .LBB97_4: -; GFX9-NEXT: v_mov_b32_e32 v21, s44 -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v18, s57 +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v18, s44 +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v19, s18 -; GFX9-NEXT: v_mov_b32_e32 v20, s19 -; GFX9-NEXT: v_mov_b32_e32 v15, s16 -; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v14, s17 ; GFX9-NEXT: v_mov_b32_e32 v11, s14 ; GFX9-NEXT: v_mov_b32_e32 v12, s15 ; GFX9-NEXT: v_mov_b32_e32 v9, s12 @@ -67859,13 +67631,13 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v17, s55 -; GFX9-NEXT: v_mov_b32_e32 v62, s53 -; GFX9-NEXT: v_mov_b32_e32 v13, s54 +; GFX9-NEXT: v_mov_b32_e32 v25, s55 +; GFX9-NEXT: v_mov_b32_e32 v17, s53 +; GFX9-NEXT: v_mov_b32_e32 v62, s54 ; GFX9-NEXT: v_mov_b32_e32 v60, s52 ; GFX9-NEXT: v_mov_b32_e32 v61, s51 -; GFX9-NEXT: v_mov_b32_e32 v58, s50 -; GFX9-NEXT: v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v59, s50 +; GFX9-NEXT: v_mov_b32_e32 v58, s48 ; GFX9-NEXT: v_mov_b32_e32 v57, s49 ; GFX9-NEXT: v_mov_b32_e32 v47, s39 ; GFX9-NEXT: v_mov_b32_e32 v56, s38 @@ -67897,45 +67669,42 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v29, s60 ; GFX9-NEXT: v_mov_b32_e32 v28, s58 ; GFX9-NEXT: v_mov_b32_e32 v27, s59 -; GFX9-NEXT: v_mov_b32_e32 v14, s57 -; GFX9-NEXT: v_mov_b32_e32 v18, s56 -; GFX9-NEXT: v_mov_b32_e32 v23, s26 -; GFX9-NEXT: v_mov_b32_e32 v24, s24 -; GFX9-NEXT: v_mov_b32_e32 v25, s22 -; GFX9-NEXT: v_mov_b32_e32 v26, s20 -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s40 -; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v26, s56 +; GFX9-NEXT: v_mov_b32_e32 v18, s42 +; GFX9-NEXT: v_mov_b32_e32 v19, s40 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s22 +; GFX9-NEXT: v_mov_b32_e32 v24, s20 ; GFX9-NEXT: .LBB97_5: ; %end -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v26 -; GFX9-NEXT: v_or_b32_sdwa v19, v62, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v16, v60, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v15, v58, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v57 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 -; GFX9-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v14, v47, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v46 ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 @@ -67947,7 +67716,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 @@ -67959,7 +67728,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 @@ -67971,7 +67740,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 @@ -67983,8 +67752,20 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -68002,29 +67783,16 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v32 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v31 -; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload @@ -68043,7 +67811,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -69227,13 +68995,13 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 @@ -69294,98 +69062,98 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v26 ; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v30 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v31 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v31 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v32 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:52 ; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB98_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v9, v29, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v41, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v46, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v12, v22, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v34, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v31, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v37, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v4, v4, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v4, v4, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -69419,11 +69187,11 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v26, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v35, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v35, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v31, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v63, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 @@ -69462,8 +69230,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr42 @@ -69474,18 +69242,18 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: .LBB98_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB98_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v16 -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v3, 0x300 ; VI-NEXT: v_add_u16_sdwa v16, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u16_e32 v0, 3, v37 +; VI-NEXT: v_add_u16_e32 v0, 3, v35 ; VI-NEXT: v_or_b32_sdwa v20, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -69508,7 +69276,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v22, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v41 ; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v2, 3, v35 +; VI-NEXT: v_add_u16_e32 v2, 3, v31 ; VI-NEXT: v_add_u16_sdwa v9, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v55 ; VI-NEXT: v_or_b32_sdwa v14, v28, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -69519,13 +69287,13 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v0, 3, v27 ; VI-NEXT: v_or_b32_sdwa v27, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_add_u16_e32 v2, 3, v37 ; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v13, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v1, 3, v63 -; VI-NEXT: v_or_b32_sdwa v15, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 3, v31 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v24, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 @@ -69591,34 +69359,34 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v19, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v23, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v30, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v31, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -69670,8 +69438,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill @@ -69680,12 +69448,12 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 @@ -69717,7 +69485,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:124 ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v3 @@ -69757,18 +69525,18 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v26 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v28 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v30 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v30 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v31 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v32 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:100 +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v32 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:52 @@ -69777,78 +69545,77 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB98_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_or_b32_sdwa v9, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v41, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v12, v26, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v37, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v30, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 @@ -69867,7 +69634,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -69884,10 +69651,10 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 ; GFX9-NEXT: v_or_b32_sdwa v13, v24, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: v_or_b32_sdwa v14, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v28, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: v_or_b32_sdwa v15, v38, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v36, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v37, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 @@ -69927,8 +69694,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr46 @@ -69937,16 +69704,16 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr22 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: .LBB98_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB98_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v24 @@ -69959,7 +69726,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -69987,15 +69754,15 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v3, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v3, v30, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v3 -; GFX9-NEXT: v_add_u16_e32 v2, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v2 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v3, 3, v32 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v2 ; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v1 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v8, v25, v8, s6 @@ -70004,18 +69771,18 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v11, v20, v11, s6 ; GFX9-NEXT: v_perm_b32 v12, v16, v12, s6 ; GFX9-NEXT: v_perm_b32 v13, v18, v13, s6 -; GFX9-NEXT: v_perm_b32 v14, v30, v14, s6 +; GFX9-NEXT: v_perm_b32 v14, v31, v14, s6 ; GFX9-NEXT: v_perm_b32 v15, v28, v15, s6 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v31, v39, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 +; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v30, v39, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -70064,35 +69831,35 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v3, v21, v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v2, v27, v2, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v1, v29, v1, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 -; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 +; GFX9-NEXT: v_perm_b32 v0, v30, v0, s6 ; GFX9-NEXT: .LBB98_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -70926,110 +70693,131 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v46, v30 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v43, v10 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:68 ; SI-NEXT: v_readfirstlane_b32 s43, v1 ; SI-NEXT: v_readfirstlane_b32 s42, v0 -; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v19 -; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 -; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v25 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v35 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v48 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v39 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v38 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v30 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v31 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v31 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v34 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v33 ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v18, v0 -; SI-NEXT: v_or_b32_e32 v37, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v38, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v56 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v44 -; SI-NEXT: v_or_b32_e32 v44, v53, v9 -; SI-NEXT: v_or_b32_e32 v33, v1, v44 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v42, v46, v9 +; SI-NEXT: v_or_b32_e32 v35, v1, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v59 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_mov_b32_e32 v15, v46 -; SI-NEXT: v_or_b32_e32 v46, v52, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v44, v57, v9 +; SI-NEXT: v_or_b32_e32 v50, v1, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v54 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v55, v3, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_mov_b32_e32 v23, v46 +; SI-NEXT: v_or_b32_e32 v46, v60, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: v_or_b32_e32 v55, v10, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v27, v13 -; SI-NEXT: v_mov_b32_e32 v58, v8 -; SI-NEXT: v_mov_b32_e32 v49, v45 -; SI-NEXT: v_mov_b32_e32 v36, v24 -; SI-NEXT: v_mov_b32_e32 v34, v26 +; SI-NEXT: v_mov_b32_e32 v27, v57 +; SI-NEXT: v_or_b32_e32 v57, v13, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v11, v41, v9 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v34, v14 +; SI-NEXT: v_mov_b32_e32 v29, v45 +; SI-NEXT: v_mov_b32_e32 v48, v63 +; SI-NEXT: v_mov_b32_e32 v33, v26 +; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: v_mov_b32_e32 v30, v62 ; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -71049,188 +70837,167 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s7, s21, 8 ; SI-NEXT: s_or_b32 s5, s5, s7 -; SI-NEXT: s_and_b32 s7, s5, 0xffff -; SI-NEXT: s_and_b32 s5, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s23, 24 -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: s_or_b32 s13, s7, s5 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s13, s5, s7 ; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 16 -; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_and_b32 s5, s28, 0xff ; SI-NEXT: s_lshl_b32 s9, s29, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s7, 0xffff -; SI-NEXT: s_and_b32 s7, s42, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_or_b32 s5, s5, s9 +; SI-NEXT: s_and_b32 s9, s42, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_lshl_b32 s10, s43, 24 -; SI-NEXT: s_or_b32 s7, s10, s7 -; SI-NEXT: s_or_b32 s15, s9, s7 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s12, s10, s9 +; SI-NEXT: s_or_b32 s15, s5, s12 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 16 ; SI-NEXT: s_or_b32 s4, s4, s14 -; SI-NEXT: v_mov_b32_e32 v39, v32 -; SI-NEXT: s_lshr_b32 s9, s5, 16 -; SI-NEXT: s_lshr_b32 s11, s7, 16 +; SI-NEXT: s_lshr_b32 s9, s7, 16 +; SI-NEXT: s_lshr_b32 s11, s12, 16 ; SI-NEXT: s_mov_b32 s7, s13 -; SI-NEXT: s_mov_b32 s5, s15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v48, v1, v46 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v59 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v35, v1, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v17 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v43, v13, v9 -; SI-NEXT: v_or_b32_e32 v50, v1, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v19 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_mov_b32_e32 v13, v4 -; SI-NEXT: v_mov_b32_e32 v4, v2 -; SI-NEXT: v_mov_b32_e32 v2, v59 -; SI-NEXT: v_mov_b32_e32 v59, v3 -; SI-NEXT: v_mov_b32_e32 v3, v63 -; SI-NEXT: v_mov_b32_e32 v63, v40 -; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_or_b32_e32 v42, v11, v9 -; SI-NEXT: v_or_b32_e32 v54, v1, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v57 +; SI-NEXT: v_mov_b32_e32 v49, v43 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: s_mov_b32 s5, s15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v1, v1, v47 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v11, v45, v9 ; SI-NEXT: v_or_b32_e32 v1, v1, v11 -; SI-NEXT: v_mov_b32_e32 v19, v10 ; SI-NEXT: v_lshr_b64 v[9:10], v[0:1], 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v47 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v23, v56, v10 -; SI-NEXT: v_mov_b32_e32 v8, v6 -; SI-NEXT: v_mov_b32_e32 v6, v14 -; SI-NEXT: v_or_b32_e32 v45, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v24 -; SI-NEXT: v_lshr_b64 v[9:10], v[44:45], 16 -; SI-NEXT: v_or_b32_e32 v0, v0, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v14, v41, v14 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v24, v17 -; SI-NEXT: v_mov_b32_e32 v17, v47 -; SI-NEXT: v_or_b32_e32 v47, v0, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[9:10], v[46:47], 16 -; SI-NEXT: v_mov_b32_e32 v46, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v62 -; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: v_or_b32_e32 v10, v45, v10 +; SI-NEXT: v_or_b32_e32 v43, v0, v10 +; SI-NEXT: v_lshr_b64 v[13:14], v[42:43], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v59 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: v_mov_b32_e32 v3, v40 +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v37, v41 +; SI-NEXT: v_mov_b32_e32 v41, v18 +; SI-NEXT: v_or_b32_e32 v18, v22, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v61 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v31, v7, v15 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v9, v61 -; SI-NEXT: v_mov_b32_e32 v61, v7 -; SI-NEXT: v_mov_b32_e32 v7, v5 -; SI-NEXT: v_mov_b32_e32 v5, v52 -; SI-NEXT: v_mov_b32_e32 v52, v41 -; SI-NEXT: v_mov_b32_e32 v41, v62 -; SI-NEXT: v_mov_b32_e32 v62, v57 -; SI-NEXT: v_mov_b32_e32 v57, v53 -; SI-NEXT: v_mov_b32_e32 v53, v56 -; SI-NEXT: v_or_b32_e32 v56, v0, v31 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v51, v22 -; SI-NEXT: v_lshr_b64 v[21:22], v[55:56], 16 +; SI-NEXT: v_mov_b32_e32 v13, v51 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v0, v0, v25 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v22, v30, v15 -; SI-NEXT: v_or_b32_e32 v44, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 -; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_or_b32_e32 v14, v5, v14 +; SI-NEXT: v_or_b32_e32 v45, v0, v14 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[25:26], v[44:45], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v44, v61 +; SI-NEXT: v_mov_b32_e32 v61, v58 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_mov_b32_e32 v63, v7 +; SI-NEXT: v_mov_b32_e32 v7, v47 +; SI-NEXT: v_or_b32_e32 v47, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v62 +; SI-NEXT: v_lshr_b64 v[21:22], v[46:47], 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v24 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v22, v19, v15 +; SI-NEXT: v_mov_b32_e32 v62, v5 +; SI-NEXT: v_mov_b32_e32 v5, v56 +; SI-NEXT: v_or_b32_e32 v56, v0, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[55:56], 16 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v38, v29, v15 -; SI-NEXT: v_lshr_b64 v[25:26], v[43:44], 16 -; SI-NEXT: v_or_b32_e32 v43, v0, v38 -; SI-NEXT: v_mov_b32_e32 v0, v30 -; SI-NEXT: v_lshr_b64 v[29:30], v[42:43], 16 -; SI-NEXT: v_mov_b32_e32 v42, v40 -; SI-NEXT: v_mov_b32_e32 v40, v63 -; SI-NEXT: v_mov_b32_e32 v63, v3 -; SI-NEXT: v_mov_b32_e32 v3, v59 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v10, v19 -; SI-NEXT: v_mov_b32_e32 v2, v4 -; SI-NEXT: v_mov_b32_e32 v4, v13 -; SI-NEXT: v_mov_b32_e32 v13, v27 +; SI-NEXT: v_or_b32_e32 v26, v32, v15 +; SI-NEXT: v_or_b32_e32 v58, v0, v26 +; SI-NEXT: v_mov_b32_e32 v46, v23 +; SI-NEXT: v_lshr_b64 v[23:24], v[57:58], 16 +; SI-NEXT: v_mov_b32_e32 v57, v27 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 -; SI-NEXT: v_mov_b32_e32 v14, v6 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: v_mov_b32_e32 v8, v58 -; SI-NEXT: v_mov_b32_e32 v22, v51 -; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: v_mov_b32_e32 v44, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v0 -; SI-NEXT: v_mov_b32_e32 v26, v34 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v38 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_mov_b32_e32 v34, v45 -; SI-NEXT: v_mov_b32_e32 v45, v49 -; SI-NEXT: v_mov_b32_e32 v49, v47 -; SI-NEXT: v_mov_b32_e32 v47, v17 -; SI-NEXT: v_mov_b32_e32 v17, v24 -; SI-NEXT: v_mov_b32_e32 v24, v36 -; SI-NEXT: v_mov_b32_e32 v36, v56 -; SI-NEXT: v_mov_b32_e32 v56, v53 -; SI-NEXT: v_mov_b32_e32 v53, v57 -; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v41 -; SI-NEXT: v_mov_b32_e32 v41, v52 -; SI-NEXT: v_mov_b32_e32 v52, v5 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v7, v61 -; SI-NEXT: v_mov_b32_e32 v61, v9 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_mov_b32_e32 v14, v34 +; SI-NEXT: v_mov_b32_e32 v26, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v36, v43 +; SI-NEXT: v_mov_b32_e32 v43, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v47 +; SI-NEXT: v_mov_b32_e32 v47, v7 +; SI-NEXT: v_mov_b32_e32 v7, v63 +; SI-NEXT: v_mov_b32_e32 v63, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v49, v56 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: v_mov_b32_e32 v5, v62 +; SI-NEXT: v_mov_b32_e32 v62, v30 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; SI-NEXT: v_mov_b32_e32 v10, v39 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: v_mov_b32_e32 v18, v41 +; SI-NEXT: v_mov_b32_e32 v41, v37 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v3, v59 +; SI-NEXT: v_mov_b32_e32 v59, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v30, v58 +; SI-NEXT: v_mov_b32_e32 v58, v61 +; SI-NEXT: v_mov_b32_e32 v61, v44 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 @@ -71293,7 +71060,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -71302,158 +71069,159 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v54, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v1, v32, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v55, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v50, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v48, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v51, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v49, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v35, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v62 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v33, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v22, v1 +; SI-NEXT: v_lshr_b64 v[25:26], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v36, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v34, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v52, v1 -; SI-NEXT: v_lshr_b64 v[25:26], v[50:51], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[54:55], 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v48, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v50, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[21:22], v[33:34], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v1, v5, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v49, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v51, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v33, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v35, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v34, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v36, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v3, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v1, v37, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v37, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v38, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v38, vcc, 0x3000000, v0 -; SI-NEXT: v_lshr_b64 v[0:1], v[37:38], 16 +; SI-NEXT: v_add_i32_e32 v39, vcc, 0x3000000, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[35:36], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[50:51], 16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[0:1], v[33:34], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[0:1], v[48:49], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[35:36], 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: .LBB99_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -71471,12 +71239,13 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v20, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_waitcnt expcnt(0) @@ -71487,24 +71256,25 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v5, s10 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_mov_b32_e32 v7, s11 -; SI-NEXT: v_mov_b32_e32 v8, v37 +; SI-NEXT: v_mov_b32_e32 v8, v38 +; SI-NEXT: v_mov_b32_e32 v10, v39 +; SI-NEXT: v_mov_b32_e32 v12, v35 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mov_b32_e32 v10, v38 -; SI-NEXT: v_mov_b32_e32 v12, v33 +; SI-NEXT: v_mov_b32_e32 v14, v36 +; SI-NEXT: v_mov_b32_e32 v16, v50 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v14, v34 -; SI-NEXT: v_mov_b32_e32 v16, v48 +; SI-NEXT: v_mov_b32_e32 v18, v51 +; SI-NEXT: v_mov_b32_e32 v22, v34 +; SI-NEXT: v_mov_b32_e32 v24, v48 +; SI-NEXT: v_mov_b32_e32 v26, v49 +; SI-NEXT: v_mov_b32_e32 v28, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v18, v49 -; SI-NEXT: v_mov_b32_e32 v20, v35 -; SI-NEXT: v_mov_b32_e32 v22, v36 -; SI-NEXT: v_mov_b32_e32 v24, v50 -; SI-NEXT: v_mov_b32_e32 v26, v51 -; SI-NEXT: v_mov_b32_e32 v28, v54 -; SI-NEXT: v_mov_b32_e32 v30, v55 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: -; SI-NEXT: v_mov_b32_e32 v39, v32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, v51 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr8 @@ -71512,28 +71282,28 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB99_2 ; @@ -71628,35 +71398,65 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB99_4 ; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v20, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v39, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v48, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s8, s4, s5 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -71664,45 +71464,15 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v35, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v37, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: v_mov_b32_e32 v24, v36 ; VI-NEXT: v_mov_b32_e32 v28, v26 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_cbranch_execnz .LBB99_3 ; VI-NEXT: .LBB99_2: ; %cmp.true ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload @@ -71895,23 +71665,20 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v48, v30 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v34, v30 +; GFX9-NEXT: v_mov_b32_e32 v30, v28 ; GFX9-NEXT: v_mov_b32_e32 v37, v26 -; GFX9-NEXT: v_mov_b32_e32 v34, v24 -; GFX9-NEXT: v_mov_b32_e32 v32, v22 -; GFX9-NEXT: v_mov_b32_e32 v30, v20 -; GFX9-NEXT: v_mov_b32_e32 v49, v14 -; GFX9-NEXT: v_mov_b32_e32 v22, v12 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 -; GFX9-NEXT: v_mov_b32_e32 v35, v8 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v4 -; GFX9-NEXT: v_mov_b32_e32 v26, v2 -; GFX9-NEXT: v_mov_b32_e32 v24, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v38, v24 +; GFX9-NEXT: v_mov_b32_e32 v48, v20 +; GFX9-NEXT: v_mov_b32_e32 v33, v16 +; GFX9-NEXT: v_mov_b32_e32 v50, v14 +; GFX9-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-NEXT: v_mov_b32_e32 v49, v10 +; GFX9-NEXT: v_mov_b32_e32 v20, v8 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 @@ -71923,17 +71690,17 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v54, 8, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 8, v15 @@ -71944,72 +71711,78 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v31 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v4 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v24 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v36 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v28 ; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v0, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v4, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v20, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v37, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 @@ -72018,43 +71791,46 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v56, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v39, v35 +; GFX9-NEXT: v_mov_b32_e32 v28, v20 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v22 +; GFX9-NEXT: v_mov_b32_e32 v26, v38 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_cbranch_execnz .LBB99_3 ; GFX9-NEXT: .LBB99_2: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 ; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 @@ -72068,55 +71844,51 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 ; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v3 +; GFX9-NEXT: v_mov_b32_e32 v20, v28 +; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 ; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v31 ; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v50 ; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 ; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 ; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 @@ -72138,16 +71910,18 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_lshl_b32 s9, s23, 8 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_and_b32 s10, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s19, 8 -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 ; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_or_b32 s10, s11, s10 @@ -72165,13 +71939,11 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 @@ -72184,19 +71956,29 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshl_or_b32 v8, v18, 16, v8 ; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 ; GFX9-NEXT: v_lshl_or_b32 v12, v24, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v22, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -72224,7 +72006,11 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB99_4: -; GFX9-NEXT: v_mov_b32_e32 v31, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v35 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v28, v20 +; GFX9-NEXT: v_mov_b32_e32 v31, v22 +; GFX9-NEXT: v_mov_b32_e32 v26, v38 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB99_2 ; @@ -75619,329 +75405,326 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-LABEL: bitcast_v32bf16_to_v32f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 -; VI-NEXT: v_mov_b32_e32 v20, s21 +; VI-NEXT: v_mov_b32_e32 v5, s21 ; VI-NEXT: v_mov_b32_e32 v6, s22 -; VI-NEXT: v_mov_b32_e32 v19, s23 +; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 -; VI-NEXT: v_mov_b32_e32 v18, s25 -; VI-NEXT: v_mov_b32_e32 v17, s27 -; VI-NEXT: v_mov_b32_e32 v16, s29 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: v_mov_b32_e32 v12, s28 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB103_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB103_3 ; VI-NEXT: .LBB103_2: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v18, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v22, v7, v9, vcc -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; VI-NEXT: v_bfe_u32 v19, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v2 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 +; VI-NEXT: v_or_b32_e32 v20, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc +; VI-NEXT: v_bfe_u32 v20, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v2 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 -; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v20, v21, vcc +; VI-NEXT: v_bfe_u32 v20, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v3 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc +; VI-NEXT: v_bfe_u32 v21, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v3 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 ; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_bfe_u32 v7, v4, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v24, v9, v11, vcc -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v4 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, v21, v22, vcc +; VI-NEXT: v_bfe_u32 v21, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v4 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc +; VI-NEXT: v_bfe_u32 v22, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v4 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v20 -; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v4, v22, v23, vcc +; VI-NEXT: v_bfe_u32 v22, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v5 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v22, v22, v23, vcc +; VI-NEXT: v_bfe_u32 v23, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v5 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 ; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v24, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_bfe_u32 v9, v6, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v20, v11, v13, vcc -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_cndmask_b32_e32 v5, v23, v24, vcc +; VI-NEXT: v_bfe_u32 v23, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v6 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 +; VI-NEXT: v_or_b32_e32 v24, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc +; VI-NEXT: v_bfe_u32 v24, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v6 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v19 -; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 -; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v24, v25, vcc +; VI-NEXT: v_bfe_u32 v24, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v7 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v24, v24, v25, vcc +; VI-NEXT: v_bfe_u32 v25, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v7 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 ; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_bfe_u32 v11, v8, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v26, v13, v19, vcc -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v8 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; VI-NEXT: v_cndmask_b32_e32 v7, v25, v26, vcc +; VI-NEXT: v_bfe_u32 v25, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v8 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc +; VI-NEXT: v_bfe_u32 v26, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v8 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v13, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18 -; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v13, v19, vcc -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v18 -; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_bfe_u32 v18, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v13 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v8, v26, v27, vcc +; VI-NEXT: v_bfe_u32 v26, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v9 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v26, v26, v27, vcc +; VI-NEXT: v_bfe_u32 v27, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v9 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_bfe_u32 v13, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_cndmask_b32_e32 v9, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v10 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 -; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v28, vcc -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v31, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v10 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v11, v28, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 -; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_bfe_u32 v28, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v13 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v13, v28, v29, vcc -; VI-NEXT: v_bfe_u32 v28, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v17 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v28, v28, v29, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v11 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v30, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v11 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_bfe_u32 v17, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v12 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_cndmask_b32_e32 v11, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v12 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v17, v17, v30, vcc -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v32, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v12 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v30, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 -; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_bfe_u32 v30, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v17 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_or_b32_e32 v31, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v30, v31, vcc -; VI-NEXT: v_bfe_u32 v30, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v16 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_or_b32_e32 v31, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v30, v31, vcc -; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v12, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v13 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v29, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v13 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 ; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc -; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x7fff, v31 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v15 -; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_bfe_u32 v32, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v14 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cndmask_b32_e32 v13, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v14 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[22:23] ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[24:25] -; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 -; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[20:21] -; VI-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 -; VI-NEXT: v_mov_b32_e32 v21, v23 -; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[26:27] -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[18:19] -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v14 +; VI-NEXT: v_cndmask_b32_e32 v33, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v14 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_mov_b32_e32 v19, v23 -; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[28:29] -; VI-NEXT: v_cndmask_b32_e32 v14, v15, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v32 -; VI-NEXT: v_lshrrev_b64 v[16:17], 16, v[16:17] -; VI-NEXT: v_mov_b32_e32 v17, v23 -; VI-NEXT: v_lshrrev_b64 v[23:24], 16, v[14:15] -; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[30:31] -; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[12:13] -; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[10:11] -; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[8:9] -; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5] -; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3] -; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; VI-NEXT: v_mov_b32_e32 v15, v23 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v14, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v15 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v27, v28, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[27:28], 16, v[14:15] +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_lshrrev_b64 v[14:15], 16, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; VI-NEXT: v_lshrrev_b64 v[28:29], 16, v[12:13] +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v32 +; VI-NEXT: v_lshrrev_b64 v[12:13], 16, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v30 +; VI-NEXT: v_lshrrev_b64 v[29:30], 16, v[10:11] +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v31 +; VI-NEXT: v_lshrrev_b64 v[10:11], 16, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[30:31], 16, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v25 +; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v24 +; VI-NEXT: v_lshrrev_b64 v[24:25], 16, v[6:7] +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v23 +; VI-NEXT: v_lshrrev_b64 v[6:7], 16, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; VI-NEXT: v_lshrrev_b64 v[22:23], 16, v[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_lshrrev_b64 v[20:21], 16, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[18:19], 16, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[16:17] +; VI-NEXT: v_mov_b32_e32 v1, v18 +; VI-NEXT: v_mov_b32_e32 v3, v20 +; VI-NEXT: v_mov_b32_e32 v5, v22 +; VI-NEXT: v_mov_b32_e32 v7, v24 +; VI-NEXT: v_mov_b32_e32 v9, v30 +; VI-NEXT: v_mov_b32_e32 v11, v29 +; VI-NEXT: v_mov_b32_e32 v13, v28 +; VI-NEXT: v_mov_b32_e32 v15, v27 ; VI-NEXT: .LBB103_3: ; %end -; VI-NEXT: v_mov_b32_e32 v1, v22 -; VI-NEXT: v_mov_b32_e32 v3, v21 -; VI-NEXT: v_mov_b32_e32 v5, v20 -; VI-NEXT: v_mov_b32_e32 v7, v19 -; VI-NEXT: v_mov_b32_e32 v9, v18 -; VI-NEXT: v_mov_b32_e32 v11, v17 -; VI-NEXT: v_mov_b32_e32 v13, v16 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_4: ; VI-NEXT: s_branch .LBB103_2 @@ -77757,27 +77540,23 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 +; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -77794,292 +77573,286 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr25 ; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr60 ; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; kill: killed $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr26 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB104_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v15 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v28, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v2 -; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v52, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; VI-NEXT: .LBB104_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB104_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v17, 0x200 -; VI-NEXT: v_add_f16_sdwa v51, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; VI-NEXT: v_add_f16_sdwa v48, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_sdwa v53, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v29, v2, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v53 +; VI-NEXT: v_add_f16_sdwa v51, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v2, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v49, v4, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v28, v1, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; VI-NEXT: v_add_f16_sdwa v38, v4, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v1, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 ; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_add_f16_sdwa v52, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v55, v4, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 +; VI-NEXT: v_add_f16_sdwa v50, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v61, v4, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_add_f16_sdwa v38, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v54, v3, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; VI-NEXT: v_add_f16_sdwa v36, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v60, v3, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_add_f16_sdwa v50, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v26, v6, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; VI-NEXT: v_add_f16_sdwa v49, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v6, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 ; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_add_f16_sdwa v36, v8, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v25, v5, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36 +; VI-NEXT: v_add_f16_sdwa v34, v8, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v5, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 ; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: v_add_f16_sdwa v48, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v23, v8, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 +; VI-NEXT: v_add_f16_sdwa v39, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v45, v8, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_add_f16_sdwa v34, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v22, v7, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_add_f16_sdwa v32, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v44, v7, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 ; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 -; VI-NEXT: v_add_f16_sdwa v39, v9, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v47, v10, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; VI-NEXT: v_add_f16_sdwa v37, v9, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v10, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v37 ; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_add_f16_sdwa v32, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v46, v9, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_add_f16_sdwa v30, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v9, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 ; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 -; VI-NEXT: v_add_f16_sdwa v37, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v21, v12, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v37 +; VI-NEXT: v_add_f16_sdwa v35, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v42, v12, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 ; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 -; VI-NEXT: v_add_f16_sdwa v31, v14, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v20, v11, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_add_f16_sdwa v29, v14, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v41, v11, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 ; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 -; VI-NEXT: v_add_f16_sdwa v35, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v41, v14, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35 +; VI-NEXT: v_add_f16_sdwa v33, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v47, v14, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 ; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 -; VI-NEXT: v_add_f16_sdwa v30, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v33, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v40, v13, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_add_f16_sdwa v28, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v31, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v46, v13, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 ; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 -; VI-NEXT: v_or_b32_e32 v43, v16, v18 -; VI-NEXT: v_or_b32_e32 v42, v15, v17 -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[42:43] -; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[40:41] -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v43 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v42 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v41 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[20:21] -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v21 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v47 -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[46:47] -; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v23 -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[22:23] -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v22 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v26 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v25 -; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[25:26] -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[54:55] -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[28:29] -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v40 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v46 -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v55 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v54 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v29 -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v28 -; VI-NEXT: v_bfe_u32 v28, v30, 8, 8 -; VI-NEXT: v_bfe_u32 v29, v31, 8, 8 -; VI-NEXT: v_bfe_u32 v46, v32, 8, 8 +; VI-NEXT: v_or_b32_e32 v53, v16, v18 +; VI-NEXT: v_or_b32_e32 v52, v15, v17 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[52:53] +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[46:47] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[41:42] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[22:23] +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v53 +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v22 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[44:45] +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v42 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v23 +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[24:25] +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[60:61] +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[26:27] +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v52 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v47 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v46 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v41 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v45 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v44 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v61 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v60 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v26 +; VI-NEXT: v_bfe_u32 v26, v28, 8, 8 +; VI-NEXT: v_bfe_u32 v27, v29, 8, 8 +; VI-NEXT: v_bfe_u32 v44, v30, 8, 8 +; VI-NEXT: v_bfe_u32 v47, v32, 8, 8 ; VI-NEXT: v_bfe_u32 v57, v34, 8, 8 -; VI-NEXT: v_bfe_u32 v59, v36, 8, 8 -; VI-NEXT: v_bfe_u32 v61, v38, 8, 8 -; VI-NEXT: v_bfe_u32 v54, v49, 8, 8 -; VI-NEXT: v_bfe_u32 v40, v51, 8, 8 +; VI-NEXT: v_bfe_u32 v60, v36, 8, 8 +; VI-NEXT: v_bfe_u32 v62, v38, 8, 8 +; VI-NEXT: v_bfe_u32 v52, v48, 8, 8 ; VI-NEXT: .LBB104_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v55 -; VI-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v24 -; VI-NEXT: v_or_b32_sdwa v22, v53, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v51, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v52 +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v63 -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v62 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v60 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v60 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v44 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v47 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 -; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v28 -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 +; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -78109,9 +77882,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -78128,108 +77899,107 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr29 -; GFX9-NEXT: ; implicit-def: $vgpr27 -; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr24 ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr23 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr22 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr30 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; kill: killed $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB104_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v16 ; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v16 ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_mov_b32_e32 v26, v23 -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB104_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB104_4 @@ -78249,158 +78019,155 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB104_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v27, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v26, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v42 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v26 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -79017,14 +78784,14 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 @@ -79094,22 +78861,22 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v26 ; SI-NEXT: s_or_b32 s9, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v31 +; SI-NEXT: v_readfirstlane_b32 s5, v30 ; SI-NEXT: s_or_b32 s6, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v29 ; SI-NEXT: s_or_b32 s7, s5, s4 -; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s5, v37 +; SI-NEXT: v_readfirstlane_b32 s5, v35 ; SI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_readfirstlane_b32 s21, v35 +; SI-NEXT: v_readfirstlane_b32 s21, v33 ; SI-NEXT: s_lshr_b64 s[22:23], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 8 ; SI-NEXT: s_lshr_b64 s[24:25], s[16:17], 24 @@ -79130,8 +78897,8 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[90:91], s[8:9], 8 ; SI-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 ; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 8 ; SI-NEXT: s_lshr_b32 s45, s19, 8 @@ -79144,16 +78911,16 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s21, s5, 8 ; SI-NEXT: v_bfe_u32 v48, v7, 8, 8 ; SI-NEXT: v_bfe_u32 v39, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v36, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v34, v20, 8, 8 -; SI-NEXT: v_bfe_u32 v32, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v30, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v38, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v37, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v32, v3, 8, 8 ; SI-NEXT: v_bfe_u32 v19, v2, 8, 8 ; SI-NEXT: v_bfe_u32 v18, v1, 8, 8 ; SI-NEXT: s_cbranch_execnz .LBB105_3 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 @@ -79162,9 +78929,9 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_readfirstlane_b32 s4, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 ; SI-NEXT: v_readfirstlane_b32 s5, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -79172,7 +78939,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s6, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v30 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readfirstlane_b32 s5, v1 ; SI-NEXT: s_lshl_b32 s5, s5, 16 @@ -79332,8 +79099,8 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[90:91], s[8:9], 8 ; SI-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 ; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 8 -; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 24 ; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 8 ; SI-NEXT: s_lshr_b32 s45, s19, 8 @@ -79346,10 +79113,10 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_lshr_b32 s21, s5, 8 ; SI-NEXT: v_bfe_u32 v48, v7, 8, 8 ; SI-NEXT: v_bfe_u32 v39, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v36, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v34, v20, 8, 8 -; SI-NEXT: v_bfe_u32 v32, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v30, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v38, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v37, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v32, v3, 8, 8 ; SI-NEXT: v_bfe_u32 v19, v2, 8, 8 ; SI-NEXT: v_bfe_u32 v18, v1, 8, 8 ; SI-NEXT: .LBB105_3: ; %end @@ -79418,7 +79185,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v38 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_or_b32_e32 v5, s14, v5 @@ -79443,7 +79210,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v34 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v37 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_or_b32_e32 v5, s12, v5 @@ -79467,7 +79234,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v34 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v4, s10, v4 @@ -79491,12 +79258,12 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v30 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v32 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v3, s8, v3 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s94, 8 +; SI-NEXT: s_lshl_b32 s8, s30, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: s_and_b32 s8, s92, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 @@ -79524,7 +79291,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: s_and_b32 s6, s34, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s30, 24 +; SI-NEXT: s_lshl_b32 s7, s94, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -79562,56 +79329,56 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v32f16_to_v64i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -79734,121 +79501,122 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: .LBB105_2: ; %cmp.true ; VI-NEXT: s_lshr_b32 s20, s19, 16 ; VI-NEXT: v_mov_b32_e32 v1, 0x200 -; VI-NEXT: v_add_f16_e32 v12, s20, v1 -; VI-NEXT: v_add_f16_e32 v27, s19, v1 +; VI-NEXT: v_add_f16_e32 v13, s20, v1 +; VI-NEXT: v_add_f16_e32 v28, s19, v1 ; VI-NEXT: s_lshr_b32 s19, s18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; VI-NEXT: v_add_f16_e32 v19, s19, v1 -; VI-NEXT: v_add_f16_e32 v35, s18, v1 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_add_f16_e32 v20, s19, v1 +; VI-NEXT: v_add_f16_e32 v36, s18, v1 ; VI-NEXT: s_lshr_b32 s18, s17, 16 -; VI-NEXT: v_or_b32_e32 v10, v27, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; VI-NEXT: v_add_f16_e32 v13, s18, v1 -; VI-NEXT: v_add_f16_e32 v28, s17, v1 +; VI-NEXT: v_or_b32_e32 v11, v28, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; VI-NEXT: v_add_f16_e32 v14, s18, v1 +; VI-NEXT: v_add_f16_e32 v29, s17, v1 ; VI-NEXT: s_lshr_b32 s17, s16, 16 -; VI-NEXT: v_or_b32_e32 v9, v35, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; VI-NEXT: v_add_f16_e32 v20, s17, v1 -; VI-NEXT: v_add_f16_e32 v36, s16, v1 +; VI-NEXT: v_or_b32_e32 v10, v36, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; VI-NEXT: v_add_f16_e32 v21, s17, v1 +; VI-NEXT: v_add_f16_e32 v37, s16, v1 ; VI-NEXT: s_lshr_b32 s16, s15, 16 -; VI-NEXT: v_or_b32_e32 v52, v28, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; VI-NEXT: v_add_f16_e32 v14, s16, v1 -; VI-NEXT: v_add_f16_e32 v29, s15, v1 +; VI-NEXT: v_or_b32_e32 v60, v29, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; VI-NEXT: v_add_f16_e32 v15, s16, v1 +; VI-NEXT: v_add_f16_e32 v30, s15, v1 ; VI-NEXT: s_lshr_b32 s15, s14, 16 -; VI-NEXT: v_or_b32_e32 v51, v36, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; VI-NEXT: v_add_f16_e32 v21, s15, v1 -; VI-NEXT: v_add_f16_e32 v37, s14, v1 +; VI-NEXT: v_or_b32_e32 v59, v37, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; VI-NEXT: v_add_f16_e32 v22, s15, v1 +; VI-NEXT: v_add_f16_e32 v38, s14, v1 ; VI-NEXT: s_lshr_b32 s14, s13, 16 -; VI-NEXT: v_or_b32_e32 v8, v29, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; VI-NEXT: v_add_f16_e32 v15, s14, v1 -; VI-NEXT: v_add_f16_e32 v30, s13, v1 +; VI-NEXT: v_or_b32_e32 v9, v30, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; VI-NEXT: v_add_f16_e32 v16, s14, v1 +; VI-NEXT: v_add_f16_e32 v31, s13, v1 ; VI-NEXT: s_lshr_b32 s13, s12, 16 -; VI-NEXT: v_or_b32_e32 v7, v37, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; VI-NEXT: v_add_f16_e32 v22, s13, v1 -; VI-NEXT: v_add_f16_e32 v38, s12, v1 +; VI-NEXT: v_or_b32_e32 v8, v38, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; VI-NEXT: v_add_f16_e32 v23, s13, v1 +; VI-NEXT: v_add_f16_e32 v39, s12, v1 ; VI-NEXT: s_lshr_b32 s12, s11, 16 -; VI-NEXT: v_or_b32_e32 v57, v30, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; VI-NEXT: v_add_f16_e32 v16, s12, v1 -; VI-NEXT: v_add_f16_e32 v31, s11, v1 +; VI-NEXT: v_or_b32_e32 v47, v31, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; VI-NEXT: v_add_f16_e32 v17, s12, v1 +; VI-NEXT: v_add_f16_e32 v32, s11, v1 ; VI-NEXT: s_lshr_b32 s11, s10, 16 -; VI-NEXT: v_or_b32_e32 v56, v38, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; VI-NEXT: v_add_f16_e32 v23, s11, v1 -; VI-NEXT: v_add_f16_e32 v39, s10, v1 +; VI-NEXT: v_or_b32_e32 v46, v39, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; VI-NEXT: v_add_f16_e32 v24, s11, v1 +; VI-NEXT: v_add_f16_e32 v48, s10, v1 ; VI-NEXT: s_lshr_b32 s10, s9, 16 -; VI-NEXT: v_or_b32_e32 v6, v31, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; VI-NEXT: v_add_f16_e32 v17, s10, v1 -; VI-NEXT: v_add_f16_e32 v32, s9, v1 +; VI-NEXT: v_or_b32_e32 v7, v32, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; VI-NEXT: v_add_f16_e32 v18, s10, v1 +; VI-NEXT: v_add_f16_e32 v33, s9, v1 ; VI-NEXT: s_lshr_b32 s9, s8, 16 -; VI-NEXT: v_or_b32_e32 v5, v39, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; VI-NEXT: v_add_f16_e32 v24, s9, v1 -; VI-NEXT: v_add_f16_e32 v48, s8, v1 +; VI-NEXT: v_or_b32_e32 v6, v48, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_add_f16_e32 v25, s9, v1 +; VI-NEXT: v_add_f16_e32 v49, s8, v1 ; VI-NEXT: s_lshr_b32 s8, s7, 16 -; VI-NEXT: v_or_b32_e32 v43, v32, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; VI-NEXT: v_add_f16_e32 v18, s8, v1 -; VI-NEXT: v_add_f16_e32 v33, s7, v1 +; VI-NEXT: v_or_b32_e32 v42, v33, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; VI-NEXT: v_add_f16_e32 v19, s8, v1 +; VI-NEXT: v_add_f16_e32 v34, s7, v1 ; VI-NEXT: s_lshr_b32 s7, s6, 16 -; VI-NEXT: v_or_b32_e32 v42, v48, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; VI-NEXT: v_add_f16_e32 v25, s7, v1 -; VI-NEXT: v_add_f16_e32 v49, s6, v1 +; VI-NEXT: v_or_b32_e32 v41, v49, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; VI-NEXT: v_add_f16_e32 v26, s7, v1 +; VI-NEXT: v_add_f16_e32 v50, s6, v1 ; VI-NEXT: s_lshr_b32 s6, s5, 16 -; VI-NEXT: v_or_b32_e32 v55, v33, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; VI-NEXT: v_add_f16_e32 v11, s6, v1 -; VI-NEXT: v_add_f16_e32 v34, s5, v1 -; VI-NEXT: s_lshr_b32 s5, s4, 16 -; VI-NEXT: v_or_b32_e32 v54, v49, v2 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; VI-NEXT: v_add_f16_e32 v26, s5, v1 -; VI-NEXT: v_or_b32_e32 v41, v34, v2 +; VI-NEXT: v_or_b32_e32 v44, v34, v2 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; VI-NEXT: v_add_f16_e32 v50, s4, v1 -; VI-NEXT: v_or_b32_e32 v40, v50, v2 -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[40:41] +; VI-NEXT: v_add_f16_e32 v12, s6, v1 +; VI-NEXT: v_add_f16_e32 v35, s5, v1 +; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: v_or_b32_e32 v43, v50, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; VI-NEXT: v_add_f16_e32 v27, s5, v1 +; VI-NEXT: v_or_b32_e32 v53, v35, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; VI-NEXT: v_add_f16_e32 v51, s4, v1 +; VI-NEXT: v_or_b32_e32 v52, v51, v2 +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[52:53] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[2:3], 24, v[54:55] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[42:43] -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 -; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6] -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v41 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v42 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[56:57] -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 -; VI-NEXT: v_lshrrev_b64 v[6:7], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v56 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 -; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[51:52] +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[43:44] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[41:42] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v40 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v51 -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v9 -; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[9:10] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v55 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[6:7] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v44 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v6 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[46:47] +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v42 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[6:7], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[59:60] +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v9 +; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[10:11] +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v52 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v43 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v57 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v52 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 -; VI-NEXT: v_bfe_u32 v9, v11, 8, 8 -; VI-NEXT: v_bfe_u32 v10, v18, 8, 8 -; VI-NEXT: v_bfe_u32 v40, v17, 8, 8 -; VI-NEXT: v_bfe_u32 v43, v16, 8, 8 -; VI-NEXT: v_bfe_u32 v46, v15, 8, 8 -; VI-NEXT: v_bfe_u32 v57, v14, 8, 8 -; VI-NEXT: v_bfe_u32 v60, v13, 8, 8 -; VI-NEXT: v_bfe_u32 v62, v12, 8, 8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v41 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v47 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v46 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v60 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v59 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; VI-NEXT: v_bfe_u32 v9, v12, 8, 8 +; VI-NEXT: v_bfe_u32 v10, v19, 8, 8 +; VI-NEXT: v_bfe_u32 v11, v18, 8, 8 +; VI-NEXT: v_bfe_u32 v43, v17, 8, 8 +; VI-NEXT: v_bfe_u32 v46, v16, 8, 8 +; VI-NEXT: v_bfe_u32 v57, v15, 8, 8 +; VI-NEXT: v_bfe_u32 v59, v14, 8, 8 +; VI-NEXT: v_bfe_u32 v62, v13, 8, 8 ; VI-NEXT: s_branch .LBB105_5 ; VI-NEXT: .LBB105_3: ; VI-NEXT: ; implicit-def: $sgpr30 @@ -79901,55 +79669,55 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr72 ; VI-NEXT: s_branch .LBB105_2 ; VI-NEXT: .LBB105_4: -; VI-NEXT: v_mov_b32_e32 v2, s57 -; VI-NEXT: v_mov_b32_e32 v53, s58 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v2, s56 -; VI-NEXT: v_mov_b32_e32 v52, s44 -; VI-NEXT: v_mov_b32_e32 v19, s67 -; VI-NEXT: v_mov_b32_e32 v12, s66 -; VI-NEXT: v_mov_b32_e32 v20, s65 -; VI-NEXT: v_mov_b32_e32 v13, s64 -; VI-NEXT: v_mov_b32_e32 v21, s55 -; VI-NEXT: v_mov_b32_e32 v14, s54 -; VI-NEXT: v_mov_b32_e32 v22, s53 -; VI-NEXT: v_mov_b32_e32 v15, s52 -; VI-NEXT: v_mov_b32_e32 v23, s51 -; VI-NEXT: v_mov_b32_e32 v16, s50 -; VI-NEXT: v_mov_b32_e32 v24, s49 -; VI-NEXT: v_mov_b32_e32 v17, s48 -; VI-NEXT: v_mov_b32_e32 v25, s39 -; VI-NEXT: v_mov_b32_e32 v18, s38 -; VI-NEXT: v_mov_b32_e32 v26, s37 -; VI-NEXT: v_mov_b32_e32 v11, s36 -; VI-NEXT: v_mov_b32_e32 v35, s18 -; VI-NEXT: v_mov_b32_e32 v27, s19 -; VI-NEXT: v_mov_b32_e32 v36, s16 -; VI-NEXT: v_mov_b32_e32 v28, s17 -; VI-NEXT: v_mov_b32_e32 v37, s14 -; VI-NEXT: v_mov_b32_e32 v29, s15 -; VI-NEXT: v_mov_b32_e32 v38, s12 -; VI-NEXT: v_mov_b32_e32 v30, s13 -; VI-NEXT: v_mov_b32_e32 v39, s10 -; VI-NEXT: v_mov_b32_e32 v31, s11 -; VI-NEXT: v_mov_b32_e32 v48, s8 -; VI-NEXT: v_mov_b32_e32 v32, s9 -; VI-NEXT: v_mov_b32_e32 v49, s6 -; VI-NEXT: v_mov_b32_e32 v33, s7 -; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_mov_b32_e32 v34, s5 +; VI-NEXT: v_mov_b32_e32 v54, s57 +; VI-NEXT: v_mov_b32_e32 v53, s42 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s44 +; VI-NEXT: v_mov_b32_e32 v3, s58 +; VI-NEXT: v_mov_b32_e32 v20, s67 +; VI-NEXT: v_mov_b32_e32 v13, s66 +; VI-NEXT: v_mov_b32_e32 v21, s65 +; VI-NEXT: v_mov_b32_e32 v14, s64 +; VI-NEXT: v_mov_b32_e32 v22, s55 +; VI-NEXT: v_mov_b32_e32 v15, s54 +; VI-NEXT: v_mov_b32_e32 v23, s53 +; VI-NEXT: v_mov_b32_e32 v16, s52 +; VI-NEXT: v_mov_b32_e32 v24, s51 +; VI-NEXT: v_mov_b32_e32 v17, s50 +; VI-NEXT: v_mov_b32_e32 v25, s49 +; VI-NEXT: v_mov_b32_e32 v18, s48 +; VI-NEXT: v_mov_b32_e32 v26, s39 +; VI-NEXT: v_mov_b32_e32 v19, s38 +; VI-NEXT: v_mov_b32_e32 v27, s37 +; VI-NEXT: v_mov_b32_e32 v12, s36 +; VI-NEXT: v_mov_b32_e32 v36, s18 +; VI-NEXT: v_mov_b32_e32 v28, s19 +; VI-NEXT: v_mov_b32_e32 v37, s16 +; VI-NEXT: v_mov_b32_e32 v29, s17 +; VI-NEXT: v_mov_b32_e32 v38, s14 +; VI-NEXT: v_mov_b32_e32 v30, s15 +; VI-NEXT: v_mov_b32_e32 v39, s12 +; VI-NEXT: v_mov_b32_e32 v31, s13 +; VI-NEXT: v_mov_b32_e32 v48, s10 +; VI-NEXT: v_mov_b32_e32 v32, s11 +; VI-NEXT: v_mov_b32_e32 v49, s8 +; VI-NEXT: v_mov_b32_e32 v33, s9 +; VI-NEXT: v_mov_b32_e32 v50, s6 +; VI-NEXT: v_mov_b32_e32 v34, s7 +; VI-NEXT: v_mov_b32_e32 v51, s4 +; VI-NEXT: v_mov_b32_e32 v35, s5 ; VI-NEXT: v_mov_b32_e32 v62, s35 -; VI-NEXT: v_mov_b32_e32 v60, s34 +; VI-NEXT: v_mov_b32_e32 v59, s34 ; VI-NEXT: v_mov_b32_e32 v57, s31 ; VI-NEXT: v_mov_b32_e32 v46, s90 ; VI-NEXT: v_mov_b32_e32 v43, s79 -; VI-NEXT: v_mov_b32_e32 v40, s77 +; VI-NEXT: v_mov_b32_e32 v11, s77 ; VI-NEXT: v_mov_b32_e32 v10, s75 -; VI-NEXT: v_mov_b32_e32 v9, s72 -; VI-NEXT: v_mov_b32_e32 v51, s30 +; VI-NEXT: v_mov_b32_e32 v52, s30 ; VI-NEXT: v_mov_b32_e32 v1, s91 ; VI-NEXT: v_mov_b32_e32 v61, s89 -; VI-NEXT: v_mov_b32_e32 v59, s88 +; VI-NEXT: v_mov_b32_e32 v60, s88 ; VI-NEXT: v_mov_b32_e32 v58, s78 ; VI-NEXT: v_mov_b32_e32 v56, s76 ; VI-NEXT: v_mov_b32_e32 v47, s74 @@ -79957,119 +79725,107 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v44, s63 ; VI-NEXT: v_mov_b32_e32 v42, s62 ; VI-NEXT: v_mov_b32_e32 v41, s61 -; VI-NEXT: v_mov_b32_e32 v55, s60 -; VI-NEXT: v_mov_b32_e32 v54, s59 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s60 +; VI-NEXT: v_mov_b32_e32 v55, s59 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v8, s20 ; VI-NEXT: v_mov_b32_e32 v7, s22 ; VI-NEXT: v_mov_b32_e32 v6, s24 ; VI-NEXT: v_mov_b32_e32 v5, s26 ; VI-NEXT: v_mov_b32_e32 v4, s28 ; VI-NEXT: v_mov_b32_e32 v3, s40 -; VI-NEXT: v_mov_b32_e32 v2, s42 -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, s72 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s56 ; VI-NEXT: .LBB105_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v51 +; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v52 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; VI-NEXT: v_or_b32_sdwa v35, v35, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v35, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v36, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v20, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v36, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v62 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v8, v12, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v13, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 ; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v20, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v7, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v59 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v60 -; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v7, v13, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v59 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v7, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v58 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v21, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v6, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v56 ; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v57 -; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v6, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v47 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v23, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v5, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v46 -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v15, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v16, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v5, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v23, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v24, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v42 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v43 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v17, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v4, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v3, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v40 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v3, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -80090,23 +79846,37 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v9 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v9 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -80124,7 +79894,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -80133,7 +79903,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v63, s30, 0 ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 @@ -80251,10 +80021,10 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: s_cbranch_execnz .LBB105_4 ; GFX9-NEXT: .LBB105_2: ; %cmp.true ; GFX9-NEXT: v_mov_b32_e32 v1, 0x200 -; GFX9-NEXT: v_pk_add_f16 v20, s19, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, s18, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v16, s17, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v15, s16, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, s19, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, s18, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, s17, v1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, s16, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v12, s15, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, s14, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, s13, v1 op_sel_hi:[1,0] @@ -80267,22 +80037,20 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_pk_add_f16 v3, s6, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v2, s5, v1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v1, s4, v1 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[26:27], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v2 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 @@ -80311,16 +80079,16 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v15 ; GFX9-NEXT: s_branch .LBB105_5 ; GFX9-NEXT: .LBB105_3: ; GFX9-NEXT: ; implicit-def: $sgpr55 @@ -80373,15 +80141,16 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: ; implicit-def: $sgpr56 ; GFX9-NEXT: s_branch .LBB105_2 ; GFX9-NEXT: .LBB105_4: -; GFX9-NEXT: v_mov_b32_e32 v21, s44 -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v18, s57 +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v18, s44 +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s42 -; GFX9-NEXT: v_mov_b32_e32 v19, s18 -; GFX9-NEXT: v_mov_b32_e32 v20, s19 -; GFX9-NEXT: v_mov_b32_e32 v15, s16 -; GFX9-NEXT: v_mov_b32_e32 v16, s17 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v15, s18 +; GFX9-NEXT: v_mov_b32_e32 v16, s19 +; GFX9-NEXT: v_mov_b32_e32 v13, s16 +; GFX9-NEXT: v_mov_b32_e32 v14, s17 ; GFX9-NEXT: v_mov_b32_e32 v11, s14 ; GFX9-NEXT: v_mov_b32_e32 v12, s15 ; GFX9-NEXT: v_mov_b32_e32 v9, s12 @@ -80394,13 +80163,13 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v17, s55 -; GFX9-NEXT: v_mov_b32_e32 v62, s53 -; GFX9-NEXT: v_mov_b32_e32 v13, s54 +; GFX9-NEXT: v_mov_b32_e32 v25, s55 +; GFX9-NEXT: v_mov_b32_e32 v17, s53 +; GFX9-NEXT: v_mov_b32_e32 v62, s54 ; GFX9-NEXT: v_mov_b32_e32 v60, s52 ; GFX9-NEXT: v_mov_b32_e32 v61, s51 -; GFX9-NEXT: v_mov_b32_e32 v58, s50 -; GFX9-NEXT: v_mov_b32_e32 v59, s48 +; GFX9-NEXT: v_mov_b32_e32 v59, s50 +; GFX9-NEXT: v_mov_b32_e32 v58, s48 ; GFX9-NEXT: v_mov_b32_e32 v57, s49 ; GFX9-NEXT: v_mov_b32_e32 v47, s39 ; GFX9-NEXT: v_mov_b32_e32 v56, s38 @@ -80432,45 +80201,42 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v29, s60 ; GFX9-NEXT: v_mov_b32_e32 v28, s58 ; GFX9-NEXT: v_mov_b32_e32 v27, s59 -; GFX9-NEXT: v_mov_b32_e32 v14, s57 -; GFX9-NEXT: v_mov_b32_e32 v18, s56 -; GFX9-NEXT: v_mov_b32_e32 v23, s26 -; GFX9-NEXT: v_mov_b32_e32 v24, s24 -; GFX9-NEXT: v_mov_b32_e32 v25, s22 -; GFX9-NEXT: v_mov_b32_e32 v26, s20 -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v21, s40 -; GFX9-NEXT: v_mov_b32_e32 v22, s28 +; GFX9-NEXT: v_mov_b32_e32 v26, s56 +; GFX9-NEXT: v_mov_b32_e32 v18, s42 +; GFX9-NEXT: v_mov_b32_e32 v19, s40 +; GFX9-NEXT: v_mov_b32_e32 v20, s28 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s24 +; GFX9-NEXT: v_mov_b32_e32 v23, s22 +; GFX9-NEXT: v_mov_b32_e32 v24, s20 ; GFX9-NEXT: .LBB105_5: ; %end -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v26 -; GFX9-NEXT: v_or_b32_sdwa v19, v62, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v13, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v60, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v13, v59, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v16, v60, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v15, v58, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v57 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v56 -; GFX9-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v47, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v56 +; GFX9-NEXT: v_or_b32_sdwa v14, v47, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v46 ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:16 @@ -80482,7 +80248,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:20 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v11, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 @@ -80494,7 +80260,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v9, v51, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:32 @@ -80506,7 +80272,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:36 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:40 @@ -80518,8 +80284,20 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -80537,29 +80315,16 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v32 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v31 -; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v29 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload @@ -80578,7 +80343,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -81679,13 +81444,13 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 @@ -81746,98 +81511,98 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v26 ; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v30 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v31 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v31 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v32 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:52 ; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB106_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v9, v29, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v41, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v46, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v12, v22, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v34, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v31, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v37, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v4, v4, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v4, v4, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -81871,11 +81636,11 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v26, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v35, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v35, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v31, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v63, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 @@ -81914,8 +81679,8 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr42 @@ -81926,18 +81691,18 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: .LBB106_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB106_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v16 -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v3, 0x300 ; VI-NEXT: v_add_u16_sdwa v16, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u16_e32 v0, 3, v37 +; VI-NEXT: v_add_u16_e32 v0, 3, v35 ; VI-NEXT: v_or_b32_sdwa v20, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -81960,7 +81725,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v22, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v41 ; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v2, 3, v35 +; VI-NEXT: v_add_u16_e32 v2, 3, v31 ; VI-NEXT: v_add_u16_sdwa v9, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v55 ; VI-NEXT: v_or_b32_sdwa v14, v28, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -81971,13 +81736,13 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v0, 3, v27 ; VI-NEXT: v_or_b32_sdwa v27, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_add_u16_e32 v2, 3, v37 ; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v13, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v1, 3, v63 -; VI-NEXT: v_or_b32_sdwa v15, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 3, v31 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v24, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 @@ -82043,34 +81808,34 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v19, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v23, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v30, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v31, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -82122,8 +81887,8 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill @@ -82132,12 +81897,12 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 @@ -82169,7 +81934,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:124 ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v3 @@ -82209,18 +81974,18 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v26 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v28 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v30 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v30 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v31 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v32 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:100 +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v32 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:52 @@ -82229,78 +81994,77 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB106_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_or_b32_sdwa v9, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v41, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v12, v26, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v37, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v30, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 @@ -82319,7 +82083,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -82336,10 +82100,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 ; GFX9-NEXT: v_or_b32_sdwa v13, v24, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: v_or_b32_sdwa v14, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v28, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: v_or_b32_sdwa v15, v38, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v36, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v37, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 @@ -82379,8 +82143,8 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr46 @@ -82389,16 +82153,16 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr22 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: .LBB106_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB106_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v24 @@ -82411,7 +82175,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -82439,15 +82203,15 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v3, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v3, v30, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v3 -; GFX9-NEXT: v_add_u16_e32 v2, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v2 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v3, 3, v32 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v2 ; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v1 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v8, v25, v8, s6 @@ -82456,18 +82220,18 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v11, v20, v11, s6 ; GFX9-NEXT: v_perm_b32 v12, v16, v12, s6 ; GFX9-NEXT: v_perm_b32 v13, v18, v13, s6 -; GFX9-NEXT: v_perm_b32 v14, v30, v14, s6 +; GFX9-NEXT: v_perm_b32 v14, v31, v14, s6 ; GFX9-NEXT: v_perm_b32 v15, v28, v15, s6 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v31, v39, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 +; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v30, v39, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -82516,35 +82280,35 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v3, v21, v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v2, v27, v2, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v1, v29, v1, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 -; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 +; GFX9-NEXT: v_perm_b32 v0, v30, v0, s6 ; GFX9-NEXT: .LBB106_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -83930,35 +83694,65 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v20, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v39, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v48, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s8, s4, s5 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -83966,45 +83760,15 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v35, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v37, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: v_mov_b32_e32 v24, v36 ; VI-NEXT: v_mov_b32_e32 v28, v26 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_cbranch_execnz .LBB107_3 ; VI-NEXT: .LBB107_2: ; %cmp.true ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload @@ -84197,23 +83961,20 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v48, v30 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v34, v30 +; GFX9-NEXT: v_mov_b32_e32 v30, v28 ; GFX9-NEXT: v_mov_b32_e32 v37, v26 -; GFX9-NEXT: v_mov_b32_e32 v34, v24 -; GFX9-NEXT: v_mov_b32_e32 v32, v22 -; GFX9-NEXT: v_mov_b32_e32 v30, v20 -; GFX9-NEXT: v_mov_b32_e32 v49, v14 -; GFX9-NEXT: v_mov_b32_e32 v22, v12 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 -; GFX9-NEXT: v_mov_b32_e32 v35, v8 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v4 -; GFX9-NEXT: v_mov_b32_e32 v26, v2 -; GFX9-NEXT: v_mov_b32_e32 v24, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v38, v24 +; GFX9-NEXT: v_mov_b32_e32 v48, v20 +; GFX9-NEXT: v_mov_b32_e32 v33, v16 +; GFX9-NEXT: v_mov_b32_e32 v50, v14 +; GFX9-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-NEXT: v_mov_b32_e32 v49, v10 +; GFX9-NEXT: v_mov_b32_e32 v20, v8 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 @@ -84225,17 +83986,17 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v54, 8, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 8, v15 @@ -84246,72 +84007,78 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v31 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v4 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v24 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v36 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v28 ; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v0, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v4, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v20, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v37, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 @@ -84320,43 +84087,46 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v56, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v39, v35 +; GFX9-NEXT: v_mov_b32_e32 v28, v20 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v22 +; GFX9-NEXT: v_mov_b32_e32 v26, v38 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_cbranch_execnz .LBB107_3 ; GFX9-NEXT: .LBB107_2: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 ; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 @@ -84370,55 +84140,51 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 ; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v3 +; GFX9-NEXT: v_mov_b32_e32 v20, v28 +; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 ; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v31 ; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v50 ; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 ; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 ; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 @@ -84440,16 +84206,18 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_lshl_b32 s9, s23, 8 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_and_b32 s10, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s19, 8 -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 ; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_or_b32 s10, s11, s10 @@ -84467,13 +84235,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 @@ -84486,19 +84252,29 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: v_lshl_or_b32 v8, v18, 16, v8 ; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 ; GFX9-NEXT: v_lshl_or_b32 v12, v24, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v22, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -84526,7 +84302,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB107_4: -; GFX9-NEXT: v_mov_b32_e32 v31, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v35 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v28, v20 +; GFX9-NEXT: v_mov_b32_e32 v31, v22 +; GFX9-NEXT: v_mov_b32_e32 v26, v38 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB107_2 ; @@ -85323,8 +85103,8 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v43, 1.0, v19 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v47, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_mul_f32_e32 v56, 1.0, v25 ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28 @@ -85335,23 +85115,23 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -85391,15 +85171,15 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_alignbit_b32 v23, v1, v52, 16 +; SI-NEXT: v_alignbit_b32 v22, v1, v52, 16 ; SI-NEXT: v_alignbit_b32 v21, v19, v49, 16 -; SI-NEXT: v_alignbit_b32 v1, v21, v23, 24 +; SI-NEXT: v_alignbit_b32 v1, v21, v22, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v21, v23, 16 +; SI-NEXT: v_alignbit_b32 v1, v21, v22, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v21, v23, 8 +; SI-NEXT: v_alignbit_b32 v1, v21, v22, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 @@ -85442,11 +85222,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v24 ; SI-NEXT: v_alignbit_b32 v8, v1, v47, 16 -; SI-NEXT: v_alignbit_b32 v9, v7, v24, 16 +; SI-NEXT: v_alignbit_b32 v9, v7, v23, 16 ; SI-NEXT: v_alignbit_b32 v1, v9, v8, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v9, v8, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill @@ -85459,59 +85239,60 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v5, v1, v56, 16 ; SI-NEXT: v_alignbit_b32 v6, v4, v28, 16 ; SI-NEXT: v_alignbit_b32 v1, v6, v5, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v6, v5, 16 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v18 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v6, v5, 8 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v34 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v39 ; SI-NEXT: v_alignbit_b32 v2, v1, v57, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v24 ; SI-NEXT: v_alignbit_b32 v3, v1, v29, 16 -; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v18 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v30 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v30 ; SI-NEXT: v_alignbit_b32 v20, v3, v2, 24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v34 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v9 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v15 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v39 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v3 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v31 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v33 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v50 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v33 ; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v12 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v15 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v12 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -85534,8 +85315,8 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr25 @@ -85556,7 +85337,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v23, v20, v19, 16 +; SI-NEXT: v_alignbit_b32 v22, v20, v19, 16 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -85612,14 +85393,15 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v21, v23, 24 +; SI-NEXT: v_alignbit_b32 v20, v21, v22, 24 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 ; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v40 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v21, v23, 16 +; SI-NEXT: v_alignbit_b32 v20, v21, v22, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 @@ -85627,31 +85409,32 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v18, v16, v18, 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v21, v23, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 +; SI-NEXT: v_alignbit_b32 v20, v21, v22, 8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v13 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v18, v17, 24 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_alignbit_b32 v11, v11, v10, 16 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 ; SI-NEXT: v_alignbit_b32 v15, v13, v15, 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v18, v17, 8 -; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v10 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -85661,17 +85444,15 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v15, v14, 16 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 ; SI-NEXT: v_alignbit_b32 v12, v10, v12, 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v15, v14, 8 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v7 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v12, v11, 24 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v23 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v12, v11, 16 @@ -85686,7 +85467,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v9, v8, 24 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v9, v8, 16 ; SI-NEXT: v_alignbit_b32 v6, v4, v6, 16 @@ -85698,7 +85479,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v3, v1, v3, 16 @@ -85711,48 +85492,48 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v27 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v27 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v18 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v21 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v9 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v15 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v18 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 -; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v21 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v50 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v30 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 -; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v29 +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v28 ; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v26 ; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24 ; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v12 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v15 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v12 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v3 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: .LBB108_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v48 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 @@ -85777,31 +85558,31 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v63 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v62 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v20, v20, v24 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v23, v20 +; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 @@ -85819,9 +85600,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v59 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v61 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 @@ -85846,11 +85629,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v60 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v59 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 @@ -85875,11 +85658,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v63 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 @@ -85906,9 +85687,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v61 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v60 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 @@ -85918,7 +85699,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 @@ -85935,7 +85716,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 @@ -85949,7 +85730,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v7 @@ -85966,7 +85747,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 @@ -85977,12 +85758,12 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 @@ -86030,9 +85811,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; VI-NEXT: ; implicit-def: $vgpr17 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -86049,108 +85828,107 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr24 ; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr58 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr21 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr51 ; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; kill: killed $vgpr17 -; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr20 -; VI-NEXT: ; implicit-def: $vgpr19 -; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; kill: killed $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB108_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v16 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16 ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; VI-NEXT: v_mov_b32_e32 v26, v22 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; VI-NEXT: .LBB108_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB108_4 @@ -86448,167 +86226,164 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v27, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v28 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v26, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; VI-NEXT: v_or_b32_sdwa v2, v2, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v36 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -86632,13 +86407,13 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; ; GFX9-LABEL: bitcast_v32bf16_to_v64i8: ; GFX9: ; %bb.0: +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr17 -; GFX9-NEXT: ; kill: killed $vgpr17 -; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -86655,107 +86430,107 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr22 -; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr45 -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr21 ; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr62 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr20 ; GFX9-NEXT: ; implicit-def: $vgpr61 -; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr59 -; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr57 -; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr47 -; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr18 ; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr55 -; GFX9-NEXT: ; kill: killed $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr25 -; GFX9-NEXT: ; implicit-def: $vgpr21 -; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr19 -; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB108_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v12 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 24, v16 +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] ; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v14 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 24, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1 -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX9-NEXT: .LBB108_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB108_4 @@ -86790,7 +86565,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: s_mov_b32 s7, 0x7060302 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v19, v20, vcc -; GFX9-NEXT: v_perm_b32 v27, v1, v18, s7 +; GFX9-NEXT: v_perm_b32 v26, v1, v18, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v19, v1, 16, 1 @@ -86819,7 +86594,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc -; GFX9-NEXT: v_perm_b32 v29, v3, v19, s7 +; GFX9-NEXT: v_perm_b32 v28, v3, v19, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1 @@ -86848,7 +86623,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v5 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v22, vcc -; GFX9-NEXT: v_perm_b32 v31, v5, v20, s7 +; GFX9-NEXT: v_perm_b32 v24, v5, v20, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v8 ; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 @@ -86877,7 +86652,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v7 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v22, v23, vcc -; GFX9-NEXT: v_perm_b32 v33, v7, v21, s7 +; GFX9-NEXT: v_perm_b32 v30, v7, v21, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v10 ; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; GFX9-NEXT: v_bfe_u32 v22, v7, 16, 1 @@ -86894,283 +86669,282 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v23, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v9 ; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_add3_u32 v23, v23, v22, s6 -; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_bfe_u32 v32, v22, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc -; GFX9-NEXT: v_bfe_u32 v23, v9, 16, 1 -; GFX9-NEXT: v_add3_u32 v23, v23, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v9 +; GFX9-NEXT: v_add3_u32 v32, v32, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_bfe_u32 v22, v9, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v36, v32, v33, vcc +; GFX9-NEXT: v_add3_u32 v22, v22, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v23, v24, vcc -; GFX9-NEXT: v_perm_b32 v35, v9, v22, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v22, v32, vcc +; GFX9-NEXT: v_perm_b32 v22, v9, v36, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v12 ; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; GFX9-NEXT: v_bfe_u32 v23, v9, 16, 1 +; GFX9-NEXT: v_bfe_u32 v32, v9, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add3_u32 v23, v23, v9, s6 -; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v9 +; GFX9-NEXT: v_add3_u32 v32, v32, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v23, v24, vcc -; GFX9-NEXT: v_bfe_u32 v23, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v23, v23, v12, s6 -; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v32, v32, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v23, v24, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v32, v33, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v24, v24, v23, s6 -; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_bfe_u32 v34, v32, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc -; GFX9-NEXT: v_bfe_u32 v24, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v24, v24, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v11 +; GFX9-NEXT: v_add3_u32 v34, v34, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_bfe_u32 v32, v11, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v37, v34, v35, vcc +; GFX9-NEXT: v_add3_u32 v32, v32, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v24, v25, vcc -; GFX9-NEXT: v_perm_b32 v37, v11, v23, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v32, v34, vcc +; GFX9-NEXT: v_perm_b32 v32, v11, v37, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v14 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_bfe_u32 v24, v11, 16, 1 +; GFX9-NEXT: v_bfe_u32 v34, v11, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v24, v24, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v11 +; GFX9-NEXT: v_add3_u32 v34, v34, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v24, v25, vcc -; GFX9-NEXT: v_bfe_u32 v24, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v24, v24, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v34, v34, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v24, v25, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v34, v35, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_add3_u32 v25, v25, v24, s6 -; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_bfe_u32 v38, v34, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc -; GFX9-NEXT: v_bfe_u32 v25, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v25, v25, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v13 +; GFX9-NEXT: v_add3_u32 v38, v38, v34, s6 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_bfe_u32 v34, v13, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc +; GFX9-NEXT: v_add3_u32 v34, v34, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v25, v26, vcc -; GFX9-NEXT: v_perm_b32 v48, v13, v24, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v34, v39, vcc +; GFX9-NEXT: v_perm_b32 v34, v13, v38, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_bfe_u32 v25, v13, 16, 1 +; GFX9-NEXT: v_bfe_u32 v39, v13, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_add3_u32 v25, v25, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v13 +; GFX9-NEXT: v_add3_u32 v39, v39, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v25, v26, vcc -; GFX9-NEXT: v_bfe_u32 v25, v16, 16, 1 -; GFX9-NEXT: v_add3_u32 v25, v25, v16, s6 -; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc +; GFX9-NEXT: v_bfe_u32 v39, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v39, v39, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v16 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v25, v26, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v39, v48, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_add3_u32 v26, v26, v25, s6 -; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_add3_u32 v48, v48, v39, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v39, vcc -; GFX9-NEXT: v_bfe_u32 v26, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v26, v26, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; GFX9-NEXT: v_bfe_u32 v48, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v48, v48, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v26, v39, vcc -; GFX9-NEXT: v_perm_b32 v51, v16, v13, s7 -; GFX9-NEXT: v_perm_b32 v50, v15, v25, s7 -; GFX9-NEXT: v_perm_b32 v28, v2, v17, s7 -; GFX9-NEXT: v_perm_b32 v30, v4, v1, s7 -; GFX9-NEXT: v_perm_b32 v49, v14, v11, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v48, v49, vcc +; GFX9-NEXT: v_perm_b32 v53, v16, v13, s7 +; GFX9-NEXT: v_perm_b32 v52, v15, v39, s7 +; GFX9-NEXT: v_perm_b32 v27, v2, v17, s7 +; GFX9-NEXT: v_perm_b32 v29, v4, v1, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[50:51] -; GFX9-NEXT: v_perm_b32 v32, v6, v3, s7 -; GFX9-NEXT: v_perm_b32 v38, v12, v9, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[52:53] +; GFX9-NEXT: v_perm_b32 v35, v14, v11, s7 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v25, v6, v3, s7 +; GFX9-NEXT: v_perm_b32 v33, v12, v9, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[48:49] -; GFX9-NEXT: v_perm_b32 v34, v8, v5, s7 -; GFX9-NEXT: v_perm_b32 v36, v10, v7, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] +; GFX9-NEXT: v_perm_b32 v31, v8, v5, s7 +; GFX9-NEXT: v_perm_b32 v23, v10, v7, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v20 -; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[37:38] -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v21 -; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[35:36] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[22:23] +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v22 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[30:31] ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v23 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[24:25] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v24 +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[28:29] +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v25 +; GFX9-NEXT: v_lshrrev_b64 v[24:25], 24, v[26:27] +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v26 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[33:34] -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v23 -; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v24 -; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[31:32] -; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[27:28] -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v51 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 24, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v36 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v36 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 24, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v31 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v29 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: .LBB108_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v27 -; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v28, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v51 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v24 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v37, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v29 -; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v30 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v63 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v32 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v30 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v59 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v61 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v46 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v47 ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v48 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v42 ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 ; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v51 ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -88373,13 +88147,13 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 @@ -88401,13 +88175,13 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_readfirstlane_b32 s4, v19 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v3 -; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v1 -; SI-NEXT: s_lshr_b32 s73, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s72, v2 -; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16 -; SI-NEXT: s_mov_b32 s75, s76 -; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24 +; SI-NEXT: s_lshr_b32 s75, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s74, v2 +; SI-NEXT: s_lshr_b64 s[76:77], s[74:75], 16 +; SI-NEXT: s_mov_b32 s73, s76 +; SI-NEXT: s_lshr_b64 s[4:5], s[72:73], 24 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v41, s4, 0 ; SI-NEXT: v_readfirstlane_b32 s4, v6 @@ -88426,172 +88200,173 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_readfirstlane_b32 s4, v12 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v13 -; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v20 -; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: s_lshr_b32 s29, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v24 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v25 -; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v22 -; SI-NEXT: s_lshr_b32 s41, s4, 16 +; SI-NEXT: s_lshr_b32 s23, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v28 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v29 -; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v26 -; SI-NEXT: s_lshr_b32 s19, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: s_lshr_b32 s17, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: v_readfirstlane_b32 s4, v14 ; SI-NEXT: s_lshr_b32 s11, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: v_readfirstlane_b32 s4, v16 ; SI-NEXT: v_readfirstlane_b32 s58, v5 ; SI-NEXT: v_readfirstlane_b32 s44, v9 -; SI-NEXT: v_readfirstlane_b32 s24, v21 -; SI-NEXT: v_readfirstlane_b32 s40, v23 -; SI-NEXT: v_readfirstlane_b32 s18, v27 -; SI-NEXT: v_readfirstlane_b32 s10, v31 +; SI-NEXT: v_readfirstlane_b32 s28, v21 +; SI-NEXT: v_readfirstlane_b32 s22, v23 +; SI-NEXT: v_readfirstlane_b32 s16, v27 +; SI-NEXT: v_readfirstlane_b32 s10, v30 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v17 ; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 ; SI-NEXT: s_mov_b32 s61, s62 ; SI-NEXT: s_mov_b32 s47, s56 -; SI-NEXT: s_mov_b32 s27, s42 -; SI-NEXT: s_mov_b32 s17, s22 -; SI-NEXT: s_mov_b32 s21, s28 +; SI-NEXT: s_mov_b32 s41, s42 +; SI-NEXT: s_mov_b32 s25, s26 +; SI-NEXT: s_mov_b32 s19, s20 ; SI-NEXT: s_mov_b32 s13, s14 ; SI-NEXT: s_mov_b32 s7, s8 -; SI-NEXT: s_lshr_b64 s[88:89], s[74:75], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[72:73], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[60:61], 24 ; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 16 ; SI-NEXT: s_lshr_b64 s[30:31], s[60:61], 8 ; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 ; SI-NEXT: s_lshr_b64 s[36:37], s[46:47], 16 ; SI-NEXT: s_lshr_b64 s[38:39], s[46:47], 8 -; SI-NEXT: s_lshr_b64 s[48:49], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[50:51], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[64:65], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[66:67], s[16:17], 8 -; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[54:55], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[70:71], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[80:81], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[82:83], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[84:85], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[98:99], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 8 ; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 ; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4 -; SI-NEXT: s_lshr_b32 s24, s76, 8 +; SI-NEXT: s_lshr_b32 s19, s76, 8 ; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8 -; SI-NEXT: s_lshr_b32 s23, s62, 8 +; SI-NEXT: s_lshr_b32 s16, s62, 8 ; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20 -; SI-NEXT: s_lshr_b32 s18, s56, 8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v22 -; SI-NEXT: s_lshr_b32 s17, s42, 8 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v26 -; SI-NEXT: s_lshr_b32 s15, s22, 8 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v30 -; SI-NEXT: s_lshr_b32 s10, s28, 8 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v18 -; SI-NEXT: s_lshr_b32 s9, s14, 8 +; SI-NEXT: s_lshr_b32 s15, s56, 8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v22 +; SI-NEXT: s_lshr_b32 s13, s42, 8 +; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 +; SI-NEXT: s_lshr_b32 s10, s26, 8 +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v14 +; SI-NEXT: s_lshr_b32 s9, s20, 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v16 +; SI-NEXT: s_lshr_b32 s7, s14, 8 ; SI-NEXT: s_lshr_b32 s4, s8, 8 -; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[98:99], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[80:81], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v35 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_readfirstlane_b32 s4, v18 ; SI-NEXT: s_lshr_b32 s5, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s4, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v14 -; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v33 +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 -; SI-NEXT: v_readfirstlane_b32 s4, v16 -; SI-NEXT: v_readfirstlane_b32 s10, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_readfirstlane_b32 s10, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v30 ; SI-NEXT: s_lshr_b32 s11, s4, 16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: s_lshr_b64 s[12:13], s[10:11], 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_readfirstlane_b32 s10, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 -; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_readfirstlane_b32 s16, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v27 ; SI-NEXT: s_lshr_b32 s11, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v27 +; SI-NEXT: s_lshr_b32 s17, s4, 16 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v16 +; SI-NEXT: s_lshr_b64 s[18:19], s[16:17], 16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v26 -; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_readfirstlane_b32 s16, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 -; SI-NEXT: s_lshr_b32 s17, s4, 16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 -; SI-NEXT: s_lshr_b64 s[20:21], s[16:17], 16 ; SI-NEXT: v_readfirstlane_b32 s4, v16 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_readfirstlane_b32 s16, v17 +; SI-NEXT: v_readfirstlane_b32 s22, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 -; SI-NEXT: s_lshr_b32 s19, s4, 16 +; SI-NEXT: s_lshr_b32 s17, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: s_lshr_b32 s17, s4, 16 +; SI-NEXT: s_lshr_b32 s23, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v17 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: s_lshr_b32 s41, s4, 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[22:23], 16 +; SI-NEXT: s_lshr_b32 s23, s4, 16 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_readfirstlane_b32 s4, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v21 -; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: v_readfirstlane_b32 s28, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: s_lshr_b32 s29, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v12 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: s_lshr_b64 s[26:27], s[24:25], 16 -; SI-NEXT: s_lshr_b32 s25, s4, 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 16 +; SI-NEXT: s_lshr_b32 s29, s4, 16 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_readfirstlane_b32 s4, v10 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -88615,262 +88390,261 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_readfirstlane_b32 s58, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v19 ; SI-NEXT: v_readfirstlane_b32 s4, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_lshr_b32 s59, s4, 16 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_readfirstlane_b32 s4, v5 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: s_lshr_b32 s73, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s72, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_readfirstlane_b32 s4, v1 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: s_lshr_b64 s[74:75], s[72:73], 16 -; SI-NEXT: s_lshr_b32 s73, s4, 16 -; SI-NEXT: v_readfirstlane_b32 s72, v2 +; SI-NEXT: v_readfirstlane_b32 s72, v3 +; SI-NEXT: s_lshr_b32 s75, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s74, v2 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: s_lshr_b64 s[76:77], s[72:73], 16 -; SI-NEXT: v_readfirstlane_b32 s40, v18 -; SI-NEXT: v_readfirstlane_b32 s24, v13 +; SI-NEXT: s_lshr_b64 s[72:73], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[74:75], 16 +; SI-NEXT: v_readfirstlane_b32 s22, v18 +; SI-NEXT: v_readfirstlane_b32 s28, v13 ; SI-NEXT: v_readfirstlane_b32 s44, v9 -; SI-NEXT: s_mov_b32 s75, s76 +; SI-NEXT: s_mov_b32 s73, s76 ; SI-NEXT: s_lshr_b64 s[14:15], s[10:11], 16 -; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[22:23], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[42:43], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[28:29], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[44:45], 16 ; SI-NEXT: s_lshr_b64 s[62:63], s[58:59], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[74:75], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[72:73], 24 ; SI-NEXT: s_mov_b32 s7, s8 ; SI-NEXT: s_mov_b32 s13, s14 -; SI-NEXT: s_mov_b32 s21, s28 -; SI-NEXT: s_mov_b32 s17, s22 -; SI-NEXT: s_mov_b32 s27, s42 +; SI-NEXT: s_mov_b32 s19, s20 +; SI-NEXT: s_mov_b32 s25, s26 +; SI-NEXT: s_mov_b32 s41, s42 ; SI-NEXT: s_mov_b32 s47, s56 ; SI-NEXT: s_mov_b32 s61, s62 ; SI-NEXT: v_writelane_b32 v41, s78, 0 ; SI-NEXT: v_writelane_b32 v41, s79, 1 -; SI-NEXT: s_lshr_b64 s[88:89], s[74:75], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[74:75], 8 -; SI-NEXT: s_lshr_b64 s[90:91], s[60:61], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[72:73], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[60:61], 24 ; SI-NEXT: s_lshr_b64 s[94:95], s[60:61], 16 ; SI-NEXT: s_lshr_b64 s[30:31], s[60:61], 8 ; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 24 ; SI-NEXT: s_lshr_b64 s[36:37], s[46:47], 16 ; SI-NEXT: s_lshr_b64 s[38:39], s[46:47], 8 -; SI-NEXT: s_lshr_b64 s[48:49], s[26:27], 24 -; SI-NEXT: s_lshr_b64 s[50:51], s[26:27], 16 -; SI-NEXT: s_lshr_b64 s[52:53], s[26:27], 8 -; SI-NEXT: s_lshr_b64 s[54:55], s[16:17], 24 -; SI-NEXT: s_lshr_b64 s[64:65], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[66:67], s[16:17], 8 -; SI-NEXT: s_lshr_b64 s[68:69], s[20:21], 24 -; SI-NEXT: s_lshr_b64 s[70:71], s[20:21], 16 -; SI-NEXT: s_lshr_b32 s24, s76, 8 -; SI-NEXT: s_lshr_b32 s23, s62, 8 -; SI-NEXT: s_lshr_b32 s18, s56, 8 -; SI-NEXT: s_lshr_b32 s17, s42, 8 -; SI-NEXT: s_lshr_b32 s15, s22, 8 -; SI-NEXT: s_lshr_b32 s10, s28, 8 -; SI-NEXT: s_lshr_b32 s9, s14, 8 +; SI-NEXT: s_lshr_b64 s[48:49], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[40:41], 8 +; SI-NEXT: s_lshr_b64 s[54:55], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[68:69], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[70:71], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[80:81], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[82:83], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[84:85], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[96:97], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[98:99], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 8 +; SI-NEXT: s_lshr_b32 s19, s76, 8 +; SI-NEXT: s_lshr_b32 s16, s62, 8 +; SI-NEXT: s_lshr_b32 s15, s56, 8 +; SI-NEXT: s_lshr_b32 s13, s42, 8 +; SI-NEXT: s_lshr_b32 s10, s26, 8 +; SI-NEXT: s_lshr_b32 s9, s20, 8 +; SI-NEXT: s_lshr_b32 s7, s14, 8 ; SI-NEXT: s_lshr_b32 s4, s8, 8 ; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v1 ; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v4 ; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v17 -; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v14 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v14 -; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 -; SI-NEXT: s_lshr_b64 s[86:87], s[12:13], 24 -; SI-NEXT: s_lshr_b64 s[96:97], s[12:13], 16 -; SI-NEXT: s_lshr_b64 s[98:99], s[12:13], 8 -; SI-NEXT: s_lshr_b64 s[80:81], s[6:7], 24 -; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 16 -; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 8 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: s_and_b32 s7, s74, 0xff -; SI-NEXT: s_lshl_b32 s13, s92, 8 -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: s_and_b32 s13, s88, 0xff -; SI-NEXT: v_readlane_b32 s74, v41, 0 -; SI-NEXT: s_lshl_b32 s21, s74, 24 -; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: s_or_b32 s13, s21, s13 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_and_b32 s7, s76, 0xff -; SI-NEXT: s_lshl_b32 s13, s24, 8 -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: s_and_b32 s13, s73, 0xff -; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v48 -; SI-NEXT: v_or_b32_e32 v2, s13, v2 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: v_or_b32_e32 v2, s7, v2 -; SI-NEXT: s_and_b32 s7, s60, 0xff -; SI-NEXT: s_lshl_b32 s13, s30, 8 -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: s_and_b32 s13, s94, 0xff -; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: s_lshl_b32 s21, s90, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s13, s21, s13 +; SI-NEXT: s_and_b32 s21, s72, 0xff +; SI-NEXT: s_lshl_b32 s22, s90, 8 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: s_and_b32 s22, s88, 0xff +; SI-NEXT: v_readlane_b32 s72, v41, 0 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s25, s72, 24 +; SI-NEXT: s_and_b32 s21, s21, 0xffff +; SI-NEXT: s_or_b32 s22, s25, s22 +; SI-NEXT: s_or_b32 s21, s21, s22 +; SI-NEXT: v_mov_b32_e32 v1, s21 +; SI-NEXT: s_and_b32 s21, s76, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 8 +; SI-NEXT: s_or_b32 s19, s21, s19 +; SI-NEXT: s_and_b32 s21, s75, 0xff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v48 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s21, v1 +; SI-NEXT: v_or_b32_e32 v1, s19, v1 +; SI-NEXT: s_and_b32 s19, s60, 0xff +; SI-NEXT: s_lshl_b32 s21, s30, 8 +; SI-NEXT: s_or_b32 s19, s19, s21 +; SI-NEXT: s_and_b32 s21, s94, 0xff +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_lshl_b32 s22, s92, 24 +; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: s_or_b32 s19, s19, s21 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s19 +; SI-NEXT: s_and_b32 s19, s62, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s62, 0xff -; SI-NEXT: s_lshl_b32 s13, s23, 8 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: s_and_b32 s13, s59, 0xff +; SI-NEXT: s_or_b32 s16, s19, s16 +; SI-NEXT: s_and_b32 s19, s59, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v39 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s13, v1 -; SI-NEXT: v_or_b32_e32 v1, s7, v1 -; SI-NEXT: s_and_b32 s7, s46, 0xff -; SI-NEXT: s_lshl_b32 s13, s38, 8 -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: s_and_b32 s13, s36, 0xff -; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s19, v1 +; SI-NEXT: v_or_b32_e32 v1, s16, v1 +; SI-NEXT: s_and_b32 s16, s46, 0xff +; SI-NEXT: s_lshl_b32 s19, s38, 8 +; SI-NEXT: s_or_b32 s16, s16, s19 +; SI-NEXT: s_and_b32 s19, s36, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s21, s34, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s13, s21, s13 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s19, s21, s19 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_or_b32 s16, s16, s19 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s56, 0xff -; SI-NEXT: s_lshl_b32 s13, s18, 8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s56, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: s_and_b32 s13, s45, 0xff +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: s_and_b32 s16, s45, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v38 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: v_or_b32_e32 v1, s13, v1 -; SI-NEXT: v_or_b32_e32 v1, s7, v1 -; SI-NEXT: s_and_b32 s7, s26, 0xff -; SI-NEXT: s_lshl_b32 s13, s52, 8 -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: s_and_b32 s13, s50, 0xff -; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: s_lshl_b32 s18, s48, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s13, s18, s13 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s16, v1 +; SI-NEXT: v_or_b32_e32 v1, s15, v1 +; SI-NEXT: s_and_b32 s15, s40, 0xff +; SI-NEXT: s_lshl_b32 s16, s52, 8 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s16, s50, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s19, s48, 24 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_or_b32 s16, s19, s16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_or_b32 s15, s15, s16 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s42, 0xff -; SI-NEXT: s_lshl_b32 s13, s17, 8 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s42, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: s_and_b32 s13, s25, 0xff +; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: s_and_b32 s15, s29, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v37 -; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: v_or_b32_e32 v1, s15, v1 ; SI-NEXT: v_or_b32_e32 v1, s13, v1 -; SI-NEXT: v_or_b32_e32 v1, s7, v1 -; SI-NEXT: s_and_b32 s7, s16, 0xff -; SI-NEXT: s_lshl_b32 s13, s66, 8 -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: s_and_b32 s13, s64, 0xff -; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_and_b32 s13, s24, 0xff +; SI-NEXT: s_lshl_b32 s15, s66, 8 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: s_and_b32 s15, s64, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_lshl_b32 s16, s54, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s13, s16, s13 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_or_b32 s13, s13, s15 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_lshl_b32 s13, s15, 8 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s26, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: s_and_b32 s13, s41, 0xff +; SI-NEXT: s_or_b32 s10, s13, s10 +; SI-NEXT: s_and_b32 s13, s23, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s13, s13, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v34 -; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v36 +; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s13, v1 -; SI-NEXT: v_or_b32_e32 v1, s7, v1 -; SI-NEXT: s_and_b32 s7, s20, 0xff -; SI-NEXT: s_lshl_b32 s13, s78, 8 -; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: v_or_b32_e32 v1, s10, v1 +; SI-NEXT: s_and_b32 s10, s18, 0xff +; SI-NEXT: s_lshl_b32 s13, s80, 8 +; SI-NEXT: s_or_b32 s10, s10, s13 ; SI-NEXT: s_and_b32 s13, s70, 0xff ; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_lshl_b32 s15, s68, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s13, s15, s13 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: s_or_b32 s7, s7, s13 +; SI-NEXT: s_or_b32 s10, s10, s13 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s28, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s20, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: s_or_b32 s7, s7, s10 -; SI-NEXT: s_and_b32 s10, s19, 0xff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s17, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v16 -; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v35 +; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s10, v1 -; SI-NEXT: v_or_b32_e32 v1, s7, v1 -; SI-NEXT: s_and_b32 s7, s12, 0xff -; SI-NEXT: s_lshl_b32 s10, s98, 8 -; SI-NEXT: s_or_b32 s7, s7, s10 -; SI-NEXT: s_and_b32 s10, s96, 0xff +; SI-NEXT: v_or_b32_e32 v1, s9, v1 +; SI-NEXT: s_and_b32 s9, s12, 0xff +; SI-NEXT: s_lshl_b32 s10, s86, 8 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s10, s84, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s12, s86, 24 -; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s12, s82, 24 +; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s10, s12, s10 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: s_or_b32 s7, s7, s10 +; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: s_and_b32 s7, s14, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s14, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_and_b32 s9, s11, 0xff ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v18 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s9, v1 ; SI-NEXT: v_or_b32_e32 v1, s7, v1 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s7, s84, 8 +; SI-NEXT: s_lshl_b32 s7, s78, 8 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s82, 0xff +; SI-NEXT: s_and_b32 s7, s98, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_lshl_b32 s9, s80, 24 +; SI-NEXT: s_lshl_b32 s9, s96, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_waitcnt expcnt(0) @@ -88886,12 +88660,12 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s4, s6, s4 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; SI-NEXT: v_readlane_b32 s75, v41, 1 +; SI-NEXT: v_readlane_b32 s73, v41, 1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v40, 35 ; SI-NEXT: v_readlane_b32 s98, v40, 34 @@ -88939,70 +88713,70 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v41, s4, 0 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: v_writelane_b32 v41, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar: @@ -89080,7 +88854,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_lshr_b32 s29, s4, 8 ; VI-NEXT: s_lshr_b32 s41, s7, 24 ; VI-NEXT: s_lshr_b32 s47, s7, 16 -; VI-NEXT: s_lshr_b32 s57, s7, 8 +; VI-NEXT: s_lshr_b32 s61, s7, 8 ; VI-NEXT: s_lshr_b32 s88, s6, 16 ; VI-NEXT: s_lshr_b32 s89, s6, 8 ; VI-NEXT: s_lshr_b32 s90, s9, 24 @@ -89115,8 +88889,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_lshr_b32 s83, s18, 8 ; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 ; VI-NEXT: s_lshr_b64 s[44:45], s[6:7], 24 -; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[56:57], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[10:11], 24 ; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 ; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 ; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 @@ -89128,7 +88902,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_mov_b32 s28, s11 ; VI-NEXT: s_mov_b32 s40, s9 ; VI-NEXT: s_mov_b32 s46, s7 -; VI-NEXT: s_mov_b32 s56, s5 +; VI-NEXT: s_mov_b32 s60, s5 ; VI-NEXT: s_cbranch_execnz .LBB109_3 ; VI-NEXT: .LBB109_2: ; %cmp.true ; VI-NEXT: s_lshl_b32 s20, s19, 16 @@ -89466,7 +89240,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_readfirstlane_b32 s5, v2 ; VI-NEXT: s_bfe_u32 s21, s5, 0x10010 ; VI-NEXT: s_add_i32 s21, s21, s5 -; VI-NEXT: s_lshr_b64 s[56:57], s[42:43], 16 +; VI-NEXT: s_lshr_b64 s[60:61], s[42:43], 16 ; VI-NEXT: s_addk_i32 s21, 0x7fff ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_bitset1_b32 s5, 22 @@ -89491,17 +89265,23 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_mov_b32 s11, s28 ; VI-NEXT: s_mov_b32 s9, s40 ; VI-NEXT: s_mov_b32 s7, s46 -; VI-NEXT: s_mov_b32 s5, s56 +; VI-NEXT: s_mov_b32 s5, s60 ; VI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 ; VI-NEXT: s_lshr_b64 s[44:45], s[6:7], 24 -; VI-NEXT: s_lshr_b32 s21, s56, 24 -; VI-NEXT: s_lshr_b32 s23, s56, 16 -; VI-NEXT: s_lshr_b32 s25, s56, 8 +; VI-NEXT: s_lshr_b64 s[56:57], s[8:9], 24 +; VI-NEXT: s_lshr_b64 s[58:59], s[10:11], 24 +; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 +; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 +; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 +; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 +; VI-NEXT: s_lshr_b32 s21, s60, 24 +; VI-NEXT: s_lshr_b32 s23, s60, 16 +; VI-NEXT: s_lshr_b32 s25, s60, 8 ; VI-NEXT: s_lshr_b32 s27, s4, 16 ; VI-NEXT: s_lshr_b32 s29, s4, 8 ; VI-NEXT: s_lshr_b32 s41, s46, 24 ; VI-NEXT: s_lshr_b32 s47, s46, 16 -; VI-NEXT: s_lshr_b32 s57, s46, 8 +; VI-NEXT: s_lshr_b32 s61, s46, 8 ; VI-NEXT: s_lshr_b32 s88, s6, 16 ; VI-NEXT: s_lshr_b32 s89, s6, 8 ; VI-NEXT: s_lshr_b32 s90, s40, 24 @@ -89534,19 +89314,13 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_lshr_b32 s81, s20, 8 ; VI-NEXT: s_lshr_b32 s82, s18, 16 ; VI-NEXT: s_lshr_b32 s83, s18, 8 -; VI-NEXT: s_lshr_b64 s[58:59], s[8:9], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[18:19], 24 ; VI-NEXT: .LBB109_3: ; %end ; VI-NEXT: s_and_b32 s5, s18, 0xff ; VI-NEXT: s_lshl_b32 s7, s83, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 -; VI-NEXT: s_lshl_b32 s7, s76, 8 -; VI-NEXT: s_and_b32 s9, s82, 0xff -; VI-NEXT: s_or_b32 s7, s9, s7 +; VI-NEXT: s_and_b32 s7, s82, 0xff +; VI-NEXT: s_lshl_b32 s9, s76, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s5, s5, s7 @@ -89638,7 +89412,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s39, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: s_and_b32 s7, s38, 0xff -; VI-NEXT: s_lshl_b32 s9, s60, 8 +; VI-NEXT: s_lshl_b32 s9, s58, 8 ; VI-NEXT: s_or_b32 s7, s7, s9 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -89662,7 +89436,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s34, 8 ; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: s_and_b32 s7, s31, 0xff -; VI-NEXT: s_lshl_b32 s8, s58, 8 +; VI-NEXT: s_lshl_b32 s8, s56, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -89695,7 +89469,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s46, 0xff -; VI-NEXT: s_lshl_b32 s6, s57, 8 +; VI-NEXT: s_lshl_b32 s6, s61, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s6, s47, 0xff ; VI-NEXT: s_lshl_b32 s7, s41, 8 @@ -89718,7 +89492,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: s_and_b32 s4, s56, 0xff +; VI-NEXT: s_and_b32 s4, s60, 0xff ; VI-NEXT: s_lshl_b32 s5, s25, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s23, 0xff @@ -89796,14 +89570,14 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr48 ; VI-NEXT: ; implicit-def: $sgpr39 ; VI-NEXT: ; implicit-def: $sgpr38 -; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: ; implicit-def: $sgpr28 ; VI-NEXT: ; implicit-def: $sgpr37 ; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr35 ; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr31 -; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr40 ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr91 @@ -89812,13 +89586,13 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr88 ; VI-NEXT: ; implicit-def: $sgpr44 ; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr41 ; VI-NEXT: ; implicit-def: $sgpr29 ; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr60 ; VI-NEXT: ; implicit-def: $sgpr25 ; VI-NEXT: ; implicit-def: $sgpr23 ; VI-NEXT: ; implicit-def: $sgpr21 @@ -92116,13 +91890,13 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 @@ -92183,98 +91957,98 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v26 ; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:124 ; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v30 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v31 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v31 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v32 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:84 ; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:76 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:52 ; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB110_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v9, v29, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v41, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v46, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v12, v22, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v34, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v31, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v37, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr22 ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v0, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v4, v4, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v7, v7, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v4, v4, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -92308,11 +92082,11 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v26, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v37, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v35, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v35, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v31, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v63, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; kill: killed $vgpr16 @@ -92351,8 +92125,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr18 ; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr16 ; VI-NEXT: ; implicit-def: $vgpr42 @@ -92363,18 +92137,18 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: .LBB110_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB110_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v16 -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v3, 0x300 ; VI-NEXT: v_add_u16_sdwa v16, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u16_e32 v0, 3, v37 +; VI-NEXT: v_add_u16_e32 v0, 3, v35 ; VI-NEXT: v_or_b32_sdwa v20, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -92397,7 +92171,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v22, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v41 ; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v2, 3, v35 +; VI-NEXT: v_add_u16_e32 v2, 3, v31 ; VI-NEXT: v_add_u16_sdwa v9, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v0, 3, v55 ; VI-NEXT: v_or_b32_sdwa v14, v28, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -92408,13 +92182,13 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v0, 3, v27 ; VI-NEXT: v_or_b32_sdwa v27, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_add_u16_e32 v2, 3, v37 ; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v13, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v1, 3, v63 -; VI-NEXT: v_or_b32_sdwa v15, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v1, 3, v31 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v24, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 ; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 @@ -92480,34 +92254,34 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v19, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v23, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v30, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v31, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v0 ; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v3, 3, v3 ; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -92559,8 +92333,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill @@ -92569,12 +92343,12 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 @@ -92606,7 +92380,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:116 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:124 ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v3 @@ -92646,18 +92420,18 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v26 ; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v28 +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v28 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v30 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v30 ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v31 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v32 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:100 +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v32 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:100 ; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:92 ; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:76 ; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:52 @@ -92666,78 +92440,77 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB110_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_or_b32_sdwa v9, v25, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v41, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v12, v26, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v37, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v30, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v32, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr25 ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr26 -; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr20 -; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr18 -; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr17 ; GFX9-NEXT: ; implicit-def: $vgpr19 ; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr43 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 @@ -92756,7 +92529,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -92773,10 +92546,10 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 ; GFX9-NEXT: v_or_b32_sdwa v13, v24, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 -; GFX9-NEXT: v_or_b32_sdwa v14, v32, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v28, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 -; GFX9-NEXT: v_or_b32_sdwa v15, v38, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v36, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v37, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; kill: killed $vgpr16 @@ -92816,8 +92589,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr16 ; GFX9-NEXT: ; implicit-def: $vgpr24 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr46 @@ -92826,16 +92599,16 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr22 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: .LBB110_2: ; %Flow ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB110_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_add_u16_e32 v0, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v24 @@ -92848,7 +92621,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v0 -; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -92876,15 +92649,15 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v25 ; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u16_e32 v3, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v3, v30, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v3 -; GFX9-NEXT: v_add_u16_e32 v2, 3, v32 -; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v2 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v3, 3, v32 ; GFX9-NEXT: v_add_u16_e32 v1, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v2 ; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v1 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v8, v25, v8, s6 @@ -92893,18 +92666,18 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v11, v20, v11, s6 ; GFX9-NEXT: v_perm_b32 v12, v16, v12, s6 ; GFX9-NEXT: v_perm_b32 v13, v18, v13, s6 -; GFX9-NEXT: v_perm_b32 v14, v30, v14, s6 +; GFX9-NEXT: v_perm_b32 v14, v31, v14, s6 ; GFX9-NEXT: v_perm_b32 v15, v28, v15, s6 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 -; GFX9-NEXT: v_or_b32_sdwa v31, v39, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 +; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v30, v39, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v30 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -92953,35 +92726,35 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v3, v21, v3, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v2, v27, v2, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v1, v29, v1, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 -; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 +; GFX9-NEXT: v_perm_b32 v0, v30, v0, s6 ; GFX9-NEXT: .LBB110_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -94221,29 +93994,29 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB111_2 ; SI-NEXT: .LBB111_4: @@ -94371,35 +94144,65 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB111_4 ; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v20, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v39, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s6, s4, s5 +; VI-NEXT: s_and_b32 s4, s20, 0xff +; VI-NEXT: s_lshl_b32 s5, s21, 8 ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v48, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s7 ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s7, s4, s5 +; VI-NEXT: s_and_b32 s4, s24, 0xff +; VI-NEXT: s_lshl_b32 s5, s25, 8 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s5, s5, s8 ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s8, s4, s5 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94407,45 +94210,15 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_or_b32_sdwa v2, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v35, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_or_b32_sdwa v0, v37, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, s4, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xff -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s20, 0xff -; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_or_b32 s5, s5, s6 -; VI-NEXT: s_and_b32 s6, s24, 0xff -; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_or_b32 s7, s7, s8 -; VI-NEXT: s_and_b32 s6, s6, 0xffff -; VI-NEXT: s_lshl_b32 s7, s7, 16 -; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: v_mov_b32_e32 v24, v36 ; VI-NEXT: v_mov_b32_e32 v28, v26 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_or_b32_e32 v3, s4, v0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: s_cbranch_execnz .LBB111_3 ; VI-NEXT: .LBB111_2: ; %cmp.true ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload @@ -94638,23 +94411,20 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v48, v30 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v34, v30 +; GFX9-NEXT: v_mov_b32_e32 v30, v28 ; GFX9-NEXT: v_mov_b32_e32 v37, v26 -; GFX9-NEXT: v_mov_b32_e32 v34, v24 -; GFX9-NEXT: v_mov_b32_e32 v32, v22 -; GFX9-NEXT: v_mov_b32_e32 v30, v20 -; GFX9-NEXT: v_mov_b32_e32 v49, v14 -; GFX9-NEXT: v_mov_b32_e32 v22, v12 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 -; GFX9-NEXT: v_mov_b32_e32 v35, v8 -; GFX9-NEXT: v_mov_b32_e32 v20, v6 -; GFX9-NEXT: v_mov_b32_e32 v28, v4 -; GFX9-NEXT: v_mov_b32_e32 v26, v2 -; GFX9-NEXT: v_mov_b32_e32 v24, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v38, v24 +; GFX9-NEXT: v_mov_b32_e32 v48, v20 +; GFX9-NEXT: v_mov_b32_e32 v33, v16 +; GFX9-NEXT: v_mov_b32_e32 v50, v14 +; GFX9-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-NEXT: v_mov_b32_e32 v49, v10 +; GFX9-NEXT: v_mov_b32_e32 v20, v8 +; GFX9-NEXT: v_mov_b32_e32 v35, v6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 @@ -94666,17 +94436,17 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v54, 8, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 8, v15 @@ -94687,72 +94457,78 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 ; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v32 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v31 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(22) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v4 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 ; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v24 ; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v36 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v28 ; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v0, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v4, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v20, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v37, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 @@ -94761,43 +94537,46 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_and_b32 s5, s20, 0xff -; GFX9-NEXT: s_lshl_b32 s6, s21, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s20, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s21, 8 ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s22, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s7 +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_and_b32 s6, s24, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s25, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s4, s5 +; GFX9-NEXT: s_and_b32 s4, s24, 0xff +; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: s_and_b32 s7, s26, 0xff +; GFX9-NEXT: s_or_b32 s4, s4, s5 +; GFX9-NEXT: s_and_b32 s5, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v56, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v39, v35 +; GFX9-NEXT: v_mov_b32_e32 v28, v20 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v31, v22 +; GFX9-NEXT: v_mov_b32_e32 v26, v38 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_cbranch_execnz .LBB111_3 ; GFX9-NEXT: .LBB111_2: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 ; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 @@ -94811,55 +94590,51 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 ; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 ; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v3 +; GFX9-NEXT: v_mov_b32_e32 v20, v28 +; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 ; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v31 ; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v26 ; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v50 ; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 ; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 ; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 @@ -94881,16 +94656,18 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s9, s23, 8 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_or_b32 s8, s9, s8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_and_b32 s10, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s19, 8 -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 ; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_or_b32 s10, s11, s10 @@ -94908,13 +94685,11 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 @@ -94927,19 +94702,29 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v8, v18, 16, v8 ; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v11, v28, 16, v11 ; GFX9-NEXT: v_lshl_or_b32 v12, v24, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 ; GFX9-NEXT: v_or_b32_sdwa v3, v22, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -94967,7 +94752,11 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB111_4: -; GFX9-NEXT: v_mov_b32_e32 v31, v5 +; GFX9-NEXT: v_mov_b32_e32 v39, v35 +; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_mov_b32_e32 v28, v20 +; GFX9-NEXT: v_mov_b32_e32 v31, v22 +; GFX9-NEXT: v_mov_b32_e32 v26, v38 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB111_2 ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index fe226fa0bb47f..ef749961f8173 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -2852,8 +2852,8 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s75, s19, 16 ; SI-NEXT: s_lshr_b32 s76, s21, 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s23, s56, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s23, s56, 16 ; SI-NEXT: s_or_b32 s20, s20, s23 ; SI-NEXT: v_mov_b32_e32 v1, s20 ; SI-NEXT: s_and_b32 s20, s21, 0xffff @@ -4457,12 +4457,12 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -4525,34 +4525,33 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 @@ -4567,6 +4566,7 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -4633,11 +4633,11 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -4655,16 +4655,14 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v13, s19 ; GFX9-NEXT: s_cbranch_execnz .LBB15_3 ; GFX9-NEXT: .LBB15_2: ; %cmp.true -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -4679,6 +4677,8 @@ define inreg <18 x i32> @bitcast_v36i16_to_v18i32_scalar(<36 x i16> inreg %a, i3 ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: .LBB15_3: ; %end @@ -6506,122 +6506,122 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 @@ -7196,8 +7196,8 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s23 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 @@ -7213,7 +7213,7 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 @@ -7230,7 +7230,7 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v2, v45, v2 ; SI-NEXT: v_or_b32_e32 v3, v63, v3 ; SI-NEXT: v_or_b32_e32 v4, v41, v4 ; SI-NEXT: v_or_b32_e32 v5, v55, v5 @@ -7249,119 +7249,120 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 @@ -7433,9 +7434,9 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v39, v24 ; SI-NEXT: v_mov_b32_e32 v43, v48 ; SI-NEXT: v_mov_b32_e32 v48, v25 -; SI-NEXT: v_mov_b32_e32 v32, v44 ; SI-NEXT: v_mov_b32_e32 v44, v49 ; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v32, v45 ; SI-NEXT: v_mov_b32_e32 v45, v50 ; SI-NEXT: v_mov_b32_e32 v50, v27 ; SI-NEXT: v_mov_b32_e32 v46, v51 @@ -7463,9 +7464,9 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v51, v46 ; SI-NEXT: v_mov_b32_e32 v27, v50 ; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v32 ; SI-NEXT: v_mov_b32_e32 v26, v49 ; SI-NEXT: v_mov_b32_e32 v49, v44 -; SI-NEXT: v_mov_b32_e32 v44, v32 ; SI-NEXT: v_mov_b32_e32 v25, v48 ; SI-NEXT: v_mov_b32_e32 v48, v43 ; SI-NEXT: s_branch .LBB19_2 @@ -7538,12 +7539,12 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -7564,13 +7565,13 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; VI-NEXT: .LBB19_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -7683,11 +7684,11 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -7709,9 +7710,9 @@ define inreg <18 x i32> @bitcast_v36f16_to_v18i32_scalar(<36 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 @@ -9935,6 +9936,7 @@ define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, ; SI-NEXT: v_lshr_b64 v[25:26], v[12:13], 16 ; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 ; SI-NEXT: v_lshr_b64 v[27:28], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 @@ -9943,8 +9945,7 @@ define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 @@ -9982,21 +9983,21 @@ define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v18, v18, v28 ; SI-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 ; SI-NEXT: v_or_b32_e32 v18, v18, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen @@ -10006,9 +10007,9 @@ define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen @@ -10094,7 +10095,7 @@ define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -10105,12 +10106,12 @@ define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v18f32_to_v36i16_scalar: @@ -11672,12 +11673,12 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -11740,34 +11741,33 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 @@ -11782,6 +11782,7 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -11848,11 +11849,11 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -11870,16 +11871,14 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v13, s19 ; GFX9-NEXT: s_cbranch_execnz .LBB31_3 ; GFX9-NEXT: .LBB31_2: ; %cmp.true -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -11894,6 +11893,8 @@ define inreg <18 x float> @bitcast_v36i16_to_v18f32_scalar(<36 x i16> inreg %a, ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: .LBB31_3: ; %end @@ -13800,122 +13801,122 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 @@ -14490,8 +14491,8 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s23 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 @@ -14507,7 +14508,7 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 @@ -14524,7 +14525,7 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v2, v45, v2 ; SI-NEXT: v_or_b32_e32 v3, v63, v3 ; SI-NEXT: v_or_b32_e32 v4, v41, v4 ; SI-NEXT: v_or_b32_e32 v5, v55, v5 @@ -14543,119 +14544,120 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 @@ -14727,9 +14729,9 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v39, v24 ; SI-NEXT: v_mov_b32_e32 v43, v48 ; SI-NEXT: v_mov_b32_e32 v48, v25 -; SI-NEXT: v_mov_b32_e32 v32, v44 ; SI-NEXT: v_mov_b32_e32 v44, v49 ; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v32, v45 ; SI-NEXT: v_mov_b32_e32 v45, v50 ; SI-NEXT: v_mov_b32_e32 v50, v27 ; SI-NEXT: v_mov_b32_e32 v46, v51 @@ -14757,9 +14759,9 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v51, v46 ; SI-NEXT: v_mov_b32_e32 v27, v50 ; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v32 ; SI-NEXT: v_mov_b32_e32 v26, v49 ; SI-NEXT: v_mov_b32_e32 v49, v44 -; SI-NEXT: v_mov_b32_e32 v44, v32 ; SI-NEXT: v_mov_b32_e32 v25, v48 ; SI-NEXT: v_mov_b32_e32 v48, v43 ; SI-NEXT: s_branch .LBB35_2 @@ -14832,12 +14834,12 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -14858,13 +14860,13 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -14977,11 +14979,11 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -15003,9 +15005,9 @@ define inreg <18 x float> @bitcast_v36f16_to_v18f32_scalar(<36 x half> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 @@ -16523,8 +16525,8 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 ; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s23, s56, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s23, s56, 16 ; SI-NEXT: s_or_b32 s20, s20, s23 ; SI-NEXT: v_mov_b32_e32 v1, s20 ; SI-NEXT: s_and_b32 s20, s21, 0xffff @@ -18128,12 +18130,12 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -18196,34 +18198,33 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 @@ -18238,6 +18239,7 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -18304,11 +18306,11 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -18326,16 +18328,14 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; GFX9-NEXT: v_mov_b32_e32 v13, s19 ; GFX9-NEXT: s_cbranch_execnz .LBB43_3 ; GFX9-NEXT: .LBB43_2: ; %cmp.true -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -18350,6 +18350,8 @@ define inreg <9 x i64> @bitcast_v36i16_to_v9i64_scalar(<36 x i16> inreg %a, i32 ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: .LBB43_3: ; %end @@ -20187,122 +20189,122 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 @@ -20877,8 +20879,8 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s23 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 @@ -20894,7 +20896,7 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 @@ -20911,7 +20913,7 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v2, v45, v2 ; SI-NEXT: v_or_b32_e32 v3, v63, v3 ; SI-NEXT: v_or_b32_e32 v4, v41, v4 ; SI-NEXT: v_or_b32_e32 v5, v55, v5 @@ -20930,119 +20932,120 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 @@ -21114,9 +21117,9 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v39, v24 ; SI-NEXT: v_mov_b32_e32 v43, v48 ; SI-NEXT: v_mov_b32_e32 v48, v25 -; SI-NEXT: v_mov_b32_e32 v32, v44 ; SI-NEXT: v_mov_b32_e32 v44, v49 ; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v32, v45 ; SI-NEXT: v_mov_b32_e32 v45, v50 ; SI-NEXT: v_mov_b32_e32 v50, v27 ; SI-NEXT: v_mov_b32_e32 v46, v51 @@ -21144,9 +21147,9 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v51, v46 ; SI-NEXT: v_mov_b32_e32 v27, v50 ; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v32 ; SI-NEXT: v_mov_b32_e32 v26, v49 ; SI-NEXT: v_mov_b32_e32 v49, v44 -; SI-NEXT: v_mov_b32_e32 v44, v32 ; SI-NEXT: v_mov_b32_e32 v25, v48 ; SI-NEXT: v_mov_b32_e32 v48, v43 ; SI-NEXT: s_branch .LBB47_2 @@ -21219,12 +21222,12 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -21245,13 +21248,13 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -21364,11 +21367,11 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -21390,9 +21393,9 @@ define inreg <9 x i64> @bitcast_v36f16_to_v9i64_scalar(<36 x half> inreg %a, i32 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 @@ -21528,12 +21531,12 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -21558,14 +21561,14 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 ; SI-NEXT: .LBB48_2: ; %Flow @@ -21589,21 +21592,21 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 ; SI-NEXT: v_alignbit_b32 v29, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v34, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v34 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v33 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 @@ -21631,7 +21634,7 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -22091,6 +22094,7 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i ; SI-NEXT: v_lshr_b64 v[25:26], v[12:13], 16 ; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 ; SI-NEXT: v_lshr_b64 v[27:28], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 @@ -22099,8 +22103,7 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 -; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 @@ -22129,21 +22132,21 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v18, v18, v28 ; SI-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 ; SI-NEXT: v_or_b32_e32 v18, v18, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen @@ -22153,9 +22156,9 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen @@ -22241,7 +22244,7 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr26 @@ -22252,12 +22255,12 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v9f64_to_v36i16_scalar: @@ -23786,12 +23789,12 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -23854,34 +23857,33 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 @@ -23896,6 +23898,7 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -23962,11 +23965,11 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -23984,16 +23987,14 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: v_mov_b32_e32 v13, s19 ; GFX9-NEXT: s_cbranch_execnz .LBB51_3 ; GFX9-NEXT: .LBB51_2: ; %cmp.true -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v35 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v39, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v38, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -24008,6 +24009,8 @@ define inreg <9 x double> @bitcast_v36i16_to_v9f64_scalar(<36 x i16> inreg %a, i ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: .LBB51_3: ; %end @@ -25840,122 +25843,122 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v58 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v59 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v43 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v42 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v36 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 @@ -26530,8 +26533,8 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v33, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v60, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s23 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v63, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 @@ -26547,7 +26550,7 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v59 @@ -26564,7 +26567,7 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_or_b32_e32 v2, v61, v2 +; SI-NEXT: v_or_b32_e32 v2, v45, v2 ; SI-NEXT: v_or_b32_e32 v3, v63, v3 ; SI-NEXT: v_or_b32_e32 v4, v41, v4 ; SI-NEXT: v_or_b32_e32 v5, v55, v5 @@ -26583,119 +26586,120 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v42 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v40 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v58 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 ; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v8, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v51 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v14, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v28 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 @@ -26767,9 +26771,9 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v39, v24 ; SI-NEXT: v_mov_b32_e32 v43, v48 ; SI-NEXT: v_mov_b32_e32 v48, v25 -; SI-NEXT: v_mov_b32_e32 v32, v44 ; SI-NEXT: v_mov_b32_e32 v44, v49 ; SI-NEXT: v_mov_b32_e32 v49, v26 +; SI-NEXT: v_mov_b32_e32 v32, v45 ; SI-NEXT: v_mov_b32_e32 v45, v50 ; SI-NEXT: v_mov_b32_e32 v50, v27 ; SI-NEXT: v_mov_b32_e32 v46, v51 @@ -26797,9 +26801,9 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v51, v46 ; SI-NEXT: v_mov_b32_e32 v27, v50 ; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v32 ; SI-NEXT: v_mov_b32_e32 v26, v49 ; SI-NEXT: v_mov_b32_e32 v49, v44 -; SI-NEXT: v_mov_b32_e32 v44, v32 ; SI-NEXT: v_mov_b32_e32 v25, v48 ; SI-NEXT: v_mov_b32_e32 v48, v43 ; SI-NEXT: s_branch .LBB55_2 @@ -26872,12 +26876,12 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v14, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -26898,13 +26902,13 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; VI-NEXT: .LBB55_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -27017,11 +27021,11 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -27043,9 +27047,9 @@ define inreg <9 x double> @bitcast_v36f16_to_v9f64_scalar(<36 x half> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v39, 16, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v15, v38, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v37, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v36, 16, v17 @@ -29571,264 +29575,264 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v22 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v45, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v41, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v42, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v55, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v40, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_or_b32_e32 v2, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v21, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v4, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v18, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v6, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v8, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_or_b32_e32 v23, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_or_b32_e32 v25, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v13, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: v_or_b32_e32 v15, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v16, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_or_b32_e32 v45, v3, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v14, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_or_b32_e32 v44, v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 -; SI-NEXT: v_or_b32_e32 v10, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v42, v3, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_or_b32_e32 v40, v5, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_or_b32_e32 v8, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_or_b32_e32 v54, v1, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_or_b32_e32 v52, v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_or_b32_e32 v50, v10, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v4, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v38 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_or_b32_e32 v2, v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 -; SI-NEXT: v_or_b32_e32 v43, v12, v17 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v45, v11, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_or_b32_e32 v42, v12, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 -; SI-NEXT: v_or_b32_e32 v54, v19, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v40, v11, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v51 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v53, v12, v7 -; SI-NEXT: v_or_b32_e32 v51, v11, v5 -; SI-NEXT: v_or_b32_e32 v48, v19, v3 -; SI-NEXT: v_or_b32_e32 v38, v22, v1 -; SI-NEXT: v_lshr_b64 v[34:35], v[20:21], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 16 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v48, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_or_b32_e32 v38, v10, v1 +; SI-NEXT: v_lshr_b64 v[34:35], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[1:2], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -29839,8 +29843,8 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -29852,7 +29856,7 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -29864,7 +29868,7 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 78c1971c50d14..03328ef1101d3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -2010,8 +2010,8 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, ; SI-LABEL: bitcast_v10f64_to_v20i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v19, v5 ; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v17, v3 @@ -2055,8 +2055,8 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, ; VI-LABEL: bitcast_v10f64_to_v20i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v12, v6 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v19, v5 ; VI-NEXT: v_mov_b32_e32 v18, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 @@ -2100,8 +2100,8 @@ define inreg <20 x i32> @bitcast_v10f64_to_v20i32_scalar(<10 x double> inreg %a, ; GFX9-LABEL: bitcast_v10f64_to_v20i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 ; GFX9-NEXT: v_mov_b32_e32 v18, v4 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 @@ -2924,6 +2924,9 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[42:43], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: s_lshr_b32 s72, s5, 16 ; SI-NEXT: s_lshr_b32 s73, s7, 16 ; SI-NEXT: s_lshr_b32 s74, s9, 16 @@ -2934,24 +2937,21 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s79, s19, 16 ; SI-NEXT: s_lshr_b32 s88, s21, 16 ; SI-NEXT: s_lshr_b32 s89, s23, 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s25, s60, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s25, s60, 16 ; SI-NEXT: s_or_b32 s22, s22, s25 ; SI-NEXT: v_mov_b32_e32 v1, s22 ; SI-NEXT: s_and_b32 s22, s23, 0xffff ; SI-NEXT: s_lshl_b32 s23, s89, 16 ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_lshl_b32 s22, s58, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: s_lshl_b32 s22, s58, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s20 @@ -4757,21 +4757,21 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -4818,42 +4818,42 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -5101,79 +5101,79 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB15_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 @@ -7338,126 +7338,127 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -7465,24 +7466,24 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -8095,7 +8096,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 @@ -8115,26 +8116,26 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 @@ -8143,7 +8144,7 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 @@ -8153,12 +8154,12 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 ; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: v_or_b32_e32 v7, v47, v7 ; SI-NEXT: v_or_b32_e32 v8, v62, v8 @@ -8175,164 +8176,165 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 @@ -8358,20 +8360,20 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v42, v48 ; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v58, v49 ; SI-NEXT: v_mov_b32_e32 v49, v20 ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v43, v50 ; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v44, v51 ; SI-NEXT: v_mov_b32_e32 v51, v23 @@ -8379,22 +8381,21 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v52, v27 ; SI-NEXT: v_mov_b32_e32 v46, v53 ; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v33 ; SI-NEXT: v_mov_b32_e32 v41, v32 ; SI-NEXT: v_mov_b32_e32 v33, v47 ; SI-NEXT: v_mov_b32_e32 v47, v54 ; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_mov_b32_e32 v42, v56 ; SI-NEXT: v_mov_b32_e32 v56, v55 ; SI-NEXT: v_mov_b32_e32 v55, v30 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v59, v57 ; SI-NEXT: v_mov_b32_e32 v57, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 ; SI-NEXT: v_mov_b32_e32 v37, v34 ; SI-NEXT: v_mov_b32_e32 v34, v24 ; SI-NEXT: v_mov_b32_e32 v32, v38 @@ -8406,34 +8407,34 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v38, v32 ; SI-NEXT: v_mov_b32_e32 v24, v34 ; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v57 -; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 ; SI-NEXT: v_mov_b32_e32 v30, v55 ; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 ; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v23, v51 ; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v22, v50 ; SI-NEXT: v_mov_b32_e32 v50, v43 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v54 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 ; SI-NEXT: v_mov_b32_e32 v28, v53 ; SI-NEXT: v_mov_b32_e32 v53, v46 ; SI-NEXT: v_mov_b32_e32 v27, v52 @@ -8505,21 +8506,21 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -8540,13 +8541,13 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; VI-NEXT: .LBB19_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -8701,8 +8702,8 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 -; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 @@ -8818,79 +8819,79 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB19_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 @@ -10334,8 +10335,8 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg % ; SI-LABEL: bitcast_v10f64_to_v20f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v19, v5 ; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v17, v3 @@ -10379,8 +10380,8 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg % ; VI-LABEL: bitcast_v10f64_to_v20f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v12, v6 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v19, v5 ; VI-NEXT: v_mov_b32_e32 v18, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 @@ -10424,8 +10425,8 @@ define inreg <20 x float> @bitcast_v10f64_to_v20f32_scalar(<10 x double> inreg % ; GFX9-LABEL: bitcast_v10f64_to_v20f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 ; GFX9-NEXT: v_mov_b32_e32 v18, v4 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 @@ -11154,141 +11155,141 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v12, s25 ; SI-NEXT: v_mov_b32_e32 v9, s26 ; SI-NEXT: v_mov_b32_e32 v10, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v8, s29 ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 ; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[17:18], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; SI-NEXT: v_lshr_b64 v[30:31], v[19:20], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[19:20], 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_lshr_b64 v[26:27], v[11:12], 16 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_lshr_b64 v[28:29], v[15:16], 16 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[29:30], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v30 ; SI-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v39 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v27 ; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v26 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 ; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v36 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen @@ -11300,7 +11301,7 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, ; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen @@ -11312,7 +11313,7 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -11324,7 +11325,7 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -11336,33 +11337,33 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v20f32_to_v40i16_scalar: @@ -13064,21 +13065,21 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -13125,42 +13126,42 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -13408,79 +13409,79 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 ; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB31_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 @@ -15674,126 +15675,127 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -15801,24 +15803,24 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -16431,7 +16433,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 @@ -16451,26 +16453,26 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 @@ -16479,7 +16481,7 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 @@ -16489,12 +16491,12 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 ; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: v_or_b32_e32 v7, v47, v7 ; SI-NEXT: v_or_b32_e32 v8, v62, v8 @@ -16511,164 +16513,165 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 @@ -16694,20 +16697,20 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v42, v48 ; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v58, v49 ; SI-NEXT: v_mov_b32_e32 v49, v20 ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v43, v50 ; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v44, v51 ; SI-NEXT: v_mov_b32_e32 v51, v23 @@ -16715,22 +16718,21 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v52, v27 ; SI-NEXT: v_mov_b32_e32 v46, v53 ; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v33 ; SI-NEXT: v_mov_b32_e32 v41, v32 ; SI-NEXT: v_mov_b32_e32 v33, v47 ; SI-NEXT: v_mov_b32_e32 v47, v54 ; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_mov_b32_e32 v42, v56 ; SI-NEXT: v_mov_b32_e32 v56, v55 ; SI-NEXT: v_mov_b32_e32 v55, v30 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v59, v57 ; SI-NEXT: v_mov_b32_e32 v57, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 ; SI-NEXT: v_mov_b32_e32 v37, v34 ; SI-NEXT: v_mov_b32_e32 v34, v24 ; SI-NEXT: v_mov_b32_e32 v32, v38 @@ -16742,34 +16744,34 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v38, v32 ; SI-NEXT: v_mov_b32_e32 v24, v34 ; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v57 -; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 ; SI-NEXT: v_mov_b32_e32 v30, v55 ; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 ; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v23, v51 ; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v22, v50 ; SI-NEXT: v_mov_b32_e32 v50, v43 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v54 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 ; SI-NEXT: v_mov_b32_e32 v28, v53 ; SI-NEXT: v_mov_b32_e32 v53, v46 ; SI-NEXT: v_mov_b32_e32 v27, v52 @@ -16841,21 +16843,21 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -16876,13 +16878,13 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -17037,8 +17039,8 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 -; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 @@ -17154,79 +17156,79 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 ; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB35_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 @@ -17950,8 +17952,8 @@ define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, ; SI-LABEL: bitcast_v10f64_to_v10i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; SI-NEXT: v_mov_b32_e32 v13, v6 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; SI-NEXT: v_mov_b32_e32 v19, v5 ; SI-NEXT: v_mov_b32_e32 v18, v4 ; SI-NEXT: v_mov_b32_e32 v17, v3 @@ -17995,8 +17997,8 @@ define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, ; VI-LABEL: bitcast_v10f64_to_v10i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v12, v6 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_mov_b32_e32 v13, v6 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; VI-NEXT: v_mov_b32_e32 v19, v5 ; VI-NEXT: v_mov_b32_e32 v18, v4 ; VI-NEXT: v_mov_b32_e32 v17, v3 @@ -18040,8 +18042,8 @@ define inreg <10 x i64> @bitcast_v10f64_to_v10i64_scalar(<10 x double> inreg %a, ; GFX9-LABEL: bitcast_v10f64_to_v10i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_mov_b32_e32 v13, v6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 ; GFX9-NEXT: v_mov_b32_e32 v18, v4 ; GFX9-NEXT: v_mov_b32_e32 v17, v3 @@ -18888,20 +18890,20 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[58:59], s[20:21], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[22:23], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s25, s60, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s25, s60, 16 ; SI-NEXT: s_or_b32 s22, s22, s25 ; SI-NEXT: v_mov_b32_e32 v1, s22 ; SI-NEXT: s_and_b32 s22, s23, 0xffff ; SI-NEXT: s_lshl_b32 s23, s89, 16 ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: v_mov_b32_e32 v2, s22 -; SI-NEXT: s_lshl_b32 s22, s58, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: s_lshl_b32 s22, s58, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s20 @@ -20707,21 +20709,21 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -20768,42 +20770,42 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -21051,79 +21053,79 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB43_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 @@ -23298,126 +23300,127 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -23425,24 +23428,24 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -24055,7 +24058,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 @@ -24075,26 +24078,26 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 @@ -24103,7 +24106,7 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 @@ -24113,12 +24116,12 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 ; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: v_or_b32_e32 v7, v47, v7 ; SI-NEXT: v_or_b32_e32 v8, v62, v8 @@ -24135,164 +24138,165 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 @@ -24318,20 +24322,20 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v42, v48 ; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v58, v49 ; SI-NEXT: v_mov_b32_e32 v49, v20 ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v43, v50 ; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v44, v51 ; SI-NEXT: v_mov_b32_e32 v51, v23 @@ -24339,22 +24343,21 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v52, v27 ; SI-NEXT: v_mov_b32_e32 v46, v53 ; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v33 ; SI-NEXT: v_mov_b32_e32 v41, v32 ; SI-NEXT: v_mov_b32_e32 v33, v47 ; SI-NEXT: v_mov_b32_e32 v47, v54 ; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_mov_b32_e32 v42, v56 ; SI-NEXT: v_mov_b32_e32 v56, v55 ; SI-NEXT: v_mov_b32_e32 v55, v30 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v59, v57 ; SI-NEXT: v_mov_b32_e32 v57, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 ; SI-NEXT: v_mov_b32_e32 v37, v34 ; SI-NEXT: v_mov_b32_e32 v34, v24 ; SI-NEXT: v_mov_b32_e32 v32, v38 @@ -24366,34 +24369,34 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v38, v32 ; SI-NEXT: v_mov_b32_e32 v24, v34 ; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v57 -; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 ; SI-NEXT: v_mov_b32_e32 v30, v55 ; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 ; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v23, v51 ; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v22, v50 ; SI-NEXT: v_mov_b32_e32 v50, v43 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v54 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 ; SI-NEXT: v_mov_b32_e32 v28, v53 ; SI-NEXT: v_mov_b32_e32 v53, v46 ; SI-NEXT: v_mov_b32_e32 v27, v52 @@ -24465,21 +24468,21 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -24500,13 +24503,13 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -24661,8 +24664,8 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 -; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 @@ -24778,79 +24781,79 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB47_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 @@ -25674,131 +25677,131 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v12, s25 ; SI-NEXT: v_mov_b32_e32 v9, s26 ; SI-NEXT: v_mov_b32_e32 v10, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v8, s29 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 ; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[17:18], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 +; SI-NEXT: v_lshr_b64 v[30:31], v[19:20], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[19:20], 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_lshr_b64 v[26:27], v[11:12], 16 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_lshr_b64 v[28:29], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v19, v19, v30 ; SI-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v48 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v39 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v27 ; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v26 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 ; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v36 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen @@ -25810,7 +25813,7 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen @@ -25822,7 +25825,7 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -25834,7 +25837,7 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -25846,33 +25849,33 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v10f64_to_v40i16_scalar: @@ -27554,21 +27557,21 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -27615,42 +27618,42 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -27898,79 +27901,79 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB51_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 @@ -28943,21 +28946,21 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_mov_b32_e32 v20, s16 -; SI-NEXT: v_mov_b32_e32 v21, s17 -; SI-NEXT: v_mov_b32_e32 v18, s18 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v13, s25 -; SI-NEXT: v_mov_b32_e32 v8, s26 -; SI-NEXT: v_mov_b32_e32 v9, s27 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v17, s18 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s27 +; SI-NEXT: v_mov_b32_e32 v7, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_mov_b32_e32 v11, s29 +; SI-NEXT: v_mov_b32_e32 v8, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill @@ -28981,39 +28984,39 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v28, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v58, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v60, v23 @@ -29023,74 +29026,74 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v19 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[10:11], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 @@ -29098,7 +29101,7 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 @@ -29243,7 +29246,7 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -29304,7 +29307,7 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v10f64_to_v40f16_scalar: @@ -30098,126 +30101,127 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v56 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v44 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v41 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v48 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -30225,24 +30229,24 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v13, v15, v13 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -30855,7 +30859,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v8 @@ -30875,26 +30879,26 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v37, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v36, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 @@ -30903,7 +30907,7 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v63 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 @@ -30913,12 +30917,12 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v2, v56, v2 +; SI-NEXT: v_or_b32_e32 v2, v42, v2 ; SI-NEXT: v_or_b32_e32 v3, v37, v3 ; SI-NEXT: v_or_b32_e32 v4, v35, v4 -; SI-NEXT: v_or_b32_e32 v5, v61, v5 +; SI-NEXT: v_or_b32_e32 v5, v33, v5 ; SI-NEXT: v_or_b32_e32 v6, v24, v6 ; SI-NEXT: v_or_b32_e32 v7, v47, v7 ; SI-NEXT: v_or_b32_e32 v8, v62, v8 @@ -30935,164 +30939,165 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v58 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v61 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v42 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v42 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v25 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 ; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v41 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v55 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v51 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v50 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v48 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v27 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 @@ -31118,20 +31123,20 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_mov_b32_e32 v42, v48 ; SI-NEXT: v_mov_b32_e32 v48, v21 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v49 +; SI-NEXT: v_mov_b32_e32 v58, v49 ; SI-NEXT: v_mov_b32_e32 v49, v20 ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v43, v50 ; SI-NEXT: v_mov_b32_e32 v50, v22 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v44, v51 ; SI-NEXT: v_mov_b32_e32 v51, v23 @@ -31139,22 +31144,21 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v52, v27 ; SI-NEXT: v_mov_b32_e32 v46, v53 ; SI-NEXT: v_mov_b32_e32 v53, v28 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v33 ; SI-NEXT: v_mov_b32_e32 v41, v32 ; SI-NEXT: v_mov_b32_e32 v33, v47 ; SI-NEXT: v_mov_b32_e32 v47, v54 ; SI-NEXT: v_mov_b32_e32 v54, v29 -; SI-NEXT: v_mov_b32_e32 v42, v56 ; SI-NEXT: v_mov_b32_e32 v56, v55 ; SI-NEXT: v_mov_b32_e32 v55, v30 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v57 +; SI-NEXT: v_mov_b32_e32 v59, v57 ; SI-NEXT: v_mov_b32_e32 v57, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v58 -; SI-NEXT: v_mov_b32_e32 v58, v37 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_mov_b32_e32 v39, v37 ; SI-NEXT: v_mov_b32_e32 v37, v34 ; SI-NEXT: v_mov_b32_e32 v34, v24 ; SI-NEXT: v_mov_b32_e32 v32, v38 @@ -31166,34 +31170,34 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v38, v32 ; SI-NEXT: v_mov_b32_e32 v24, v34 ; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v58 -; SI-NEXT: v_mov_b32_e32 v58, v39 -; SI-NEXT: v_mov_b32_e32 v31, v40 -; SI-NEXT: v_mov_b32_e32 v40, v57 -; SI-NEXT: v_mov_b32_e32 v57, v36 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v39, v36 ; SI-NEXT: v_mov_b32_e32 v30, v55 ; SI-NEXT: v_mov_b32_e32 v55, v56 -; SI-NEXT: v_mov_b32_e32 v56, v42 +; SI-NEXT: v_mov_b32_e32 v29, v54 +; SI-NEXT: v_mov_b32_e32 v54, v47 +; SI-NEXT: v_mov_b32_e32 v47, v33 ; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v23, v51 ; SI-NEXT: v_mov_b32_e32 v51, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v22, v50 ; SI-NEXT: v_mov_b32_e32 v50, v43 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v20, v49 -; SI-NEXT: v_mov_b32_e32 v49, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v49, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v21, v48 -; SI-NEXT: v_mov_b32_e32 v48, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v54 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v47, v33 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 ; SI-NEXT: v_mov_b32_e32 v28, v53 ; SI-NEXT: v_mov_b32_e32 v53, v46 ; SI-NEXT: v_mov_b32_e32 v27, v52 @@ -31265,21 +31269,21 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 ; VI-NEXT: v_or_b32_sdwa v14, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v17, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v36, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -31300,13 +31304,13 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; VI-NEXT: .LBB55_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -31461,8 +31465,8 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v32 -; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v16 @@ -31578,79 +31582,79 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v185, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v186, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v1 :: v_dual_mov_b32 v186, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 ; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB55_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v35 :: v_dual_mov_b32 v8, v44 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v77 :: v_dual_mov_b32 v12, v90 @@ -32805,59 +32809,58 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v62, s16 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v63, s18 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s21 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s22 ; SI-NEXT: v_cvt_f32_f16_e32 v61, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 @@ -32906,33 +32909,33 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v29, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s22 ; SI-NEXT: v_cvt_f32_f16_e32 v61, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v32, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v23 @@ -32963,84 +32966,84 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -33049,7 +33052,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -33058,7 +33061,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -33105,49 +33108,49 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; kill: killed $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v40i16_to_v40f16_scalar: @@ -34584,367 +34587,369 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-LABEL: bitcast_v40f16_to_v40i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v47, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v46, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v45, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v59, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v47, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v40 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v59, v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v45, v22, v17 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 -; SI-NEXT: v_or_b32_e32 v54, v22, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_or_b32_e32 v58, v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v57, v3, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v58, v1, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 +; SI-NEXT: v_or_b32_e32 v56, v5, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v44 +; SI-NEXT: v_or_b32_e32 v46, v3, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v43, v3, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v47, v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 +; SI-NEXT: v_or_b32_e32 v44, v5, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v41, v3, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v54 +; SI-NEXT: v_or_b32_e32 v40, v5, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v53 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_or_b32_e32 v54, v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v53, v16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v40, v3, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_or_b32_e32 v4, v4, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v41, v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v28 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v15 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_or_b32_e32 v51, v22, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 -; SI-NEXT: v_or_b32_e32 v4, v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v52 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: v_or_b32_e32 v6, v6, v21 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_or_b32_e32 v8, v8, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v44 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 -; SI-NEXT: v_or_b32_e32 v10, v10, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v21 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v44 -; SI-NEXT: v_or_b32_e32 v14, v14, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v42 -; SI-NEXT: v_or_b32_e32 v18, v18, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v52 +; SI-NEXT: v_or_b32_e32 v6, v6, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v51 +; SI-NEXT: v_or_b32_e32 v8, v8, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v43 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v55 +; SI-NEXT: v_or_b32_e32 v28, v16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v42 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v43 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v30, v15, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v45 +; SI-NEXT: v_or_b32_e32 v32, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v57 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v22 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v46 -; SI-NEXT: v_or_b32_e32 v16, v16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v59 -; SI-NEXT: v_or_b32_e32 v12, v12, v22 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 16 -; SI-NEXT: v_or_b32_e32 v50, v23, v1 -; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[26:27], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[23:24], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v45 +; SI-NEXT: v_or_b32_e32 v12, v12, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v57 +; SI-NEXT: v_or_b32_e32 v10, v10, v16 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[1:2], 16 +; SI-NEXT: v_or_b32_e32 v1, v60, v1 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v35 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v50 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 9cff9c4a9dc65..714f3c69102e1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -2129,8 +2129,8 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, ; SI-LABEL: bitcast_v11f64_to_v22i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -2177,8 +2177,8 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, ; VI-LABEL: bitcast_v11f64_to_v22i32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v10, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -2225,8 +2225,8 @@ define inreg <22 x i32> @bitcast_v11f64_to_v22i32_scalar(<11 x double> inreg %a, ; GFX9-LABEL: bitcast_v11f64_to_v22i32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -3113,6 +3113,11 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 ; SI-NEXT: s_lshr_b64 s[44:45], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: s_lshr_b32 s76, s5, 16 ; SI-NEXT: s_lshr_b32 s77, s7, 16 ; SI-NEXT: s_lshr_b32 s78, s9, 16 @@ -3124,43 +3129,40 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s92, s21, 16 ; SI-NEXT: s_lshr_b32 s93, s23, 16 ; SI-NEXT: s_lshr_b32 s94, s25, 16 -; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 -; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s72, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s27, s72, 16 ; SI-NEXT: s_or_b32 s24, s24, s27 ; SI-NEXT: v_mov_b32_e32 v1, s24 ; SI-NEXT: s_and_b32 s24, s25, 0xffff ; SI-NEXT: s_lshl_b32 s25, s94, 16 ; SI-NEXT: s_or_b32 s24, s24, s25 ; SI-NEXT: v_mov_b32_e32 v2, s24 -; SI-NEXT: s_lshl_b32 s24, s62, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s62, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: v_mov_b32_e32 v3, s22 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s22 ; SI-NEXT: s_and_b32 s22, s23, 0xffff ; SI-NEXT: s_lshl_b32 s23, s93, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: v_mov_b32_e32 v4, s22 -; SI-NEXT: s_lshl_b32 s22, s60, 16 -; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s20 ; SI-NEXT: s_and_b32 s20, s21, 0xffff ; SI-NEXT: s_lshl_b32 s21, s92, 16 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -4900,6 +4902,8 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 ; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v8, v0, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: v_or_b32_e32 v9, v0, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 @@ -4909,45 +4913,43 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v12, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v20, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v60 ; SI-NEXT: v_or_b32_e32 v21, v0, v55 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -5142,31 +5144,31 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -5197,66 +5199,66 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff ; VI-NEXT: s_lshl_b32 s17, s41, 16 ; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s16, s17, s16 ; VI-NEXT: s_and_b32 s17, s19, 0xffff ; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s17, s18, s17 ; VI-NEXT: s_and_b32 s18, s20, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s15, s15, s18 ; VI-NEXT: s_and_b32 s18, s21, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -5361,6 +5363,8 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -5370,10 +5374,8 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -5391,24 +5393,22 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v13, s19 ; GFX9-NEXT: s_cbranch_execnz .LBB15_3 ; GFX9-NEXT: .LBB15_2: ; %cmp.true -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v38 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -5423,6 +5423,8 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -5521,81 +5523,81 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB15_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -7985,142 +7987,140 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -8819,15 +8819,15 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 @@ -8851,16 +8851,16 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 @@ -8868,21 +8868,20 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 @@ -8897,16 +8896,17 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v3, v62, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 ; SI-NEXT: v_or_b32_e32 v5, v28, v5 ; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 ; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: v_or_b32_e32 v12, v57, v12 ; SI-NEXT: v_or_b32_e32 v13, v47, v13 @@ -8920,66 +8920,64 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 @@ -8993,53 +8991,55 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 @@ -9123,88 +9123,91 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v59, v46 ; SI-NEXT: v_mov_b32_e32 v46, v41 ; SI-NEXT: v_mov_b32_e32 v41, v52 ; SI-NEXT: v_mov_b32_e32 v52, v23 -; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v60, v47 ; SI-NEXT: v_mov_b32_e32 v47, v42 ; SI-NEXT: v_mov_b32_e32 v42, v53 ; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: v_mov_b32_e32 v38, v61 ; SI-NEXT: v_mov_b32_e32 v61, v56 ; SI-NEXT: v_mov_b32_e32 v56, v43 ; SI-NEXT: v_mov_b32_e32 v43, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v39, v62 ; SI-NEXT: v_mov_b32_e32 v62, v57 ; SI-NEXT: v_mov_b32_e32 v57, v44 ; SI-NEXT: v_mov_b32_e32 v44, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v33, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v63 ; SI-NEXT: v_mov_b32_e32 v63, v58 ; SI-NEXT: v_mov_b32_e32 v58, v45 ; SI-NEXT: v_mov_b32_e32 v45, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v38, v27 -; SI-NEXT: v_mov_b32_e32 v37, v28 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: v_mov_b32_e32 v32, v49 +; SI-NEXT: v_mov_b32_e32 v49, v34 +; SI-NEXT: v_mov_b32_e32 v34, v48 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v36, v28 +; SI-NEXT: v_mov_b32_e32 v35, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v28, v37 -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v29, v35 +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: v_mov_b32_e32 v27, v37 +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_mov_b32_e32 v48, v34 +; SI-NEXT: v_mov_b32_e32 v34, v49 +; SI-NEXT: v_mov_b32_e32 v49, v32 ; SI-NEXT: v_mov_b32_e32 v31, v40 ; SI-NEXT: v_mov_b32_e32 v40, v45 ; SI-NEXT: v_mov_b32_e32 v45, v58 ; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v33 ; SI-NEXT: v_mov_b32_e32 v25, v55 ; SI-NEXT: v_mov_b32_e32 v55, v44 ; SI-NEXT: v_mov_b32_e32 v44, v57 ; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v34 -; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v62, v39 ; SI-NEXT: v_mov_b32_e32 v24, v54 ; SI-NEXT: v_mov_b32_e32 v54, v43 ; SI-NEXT: v_mov_b32_e32 v43, v56 ; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v35 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v42, v47 -; SI-NEXT: v_mov_b32_e32 v47, v60 -; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v61, v38 ; SI-NEXT: v_mov_b32_e32 v23, v52 ; SI-NEXT: v_mov_b32_e32 v52, v41 ; SI-NEXT: v_mov_b32_e32 v41, v46 ; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v44f16_to_v22i32_scalar: @@ -9268,31 +9271,31 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -9313,13 +9316,13 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; VI-NEXT: .LBB19_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -9452,6 +9455,8 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -9461,10 +9466,8 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -9490,9 +9493,9 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 @@ -9614,81 +9617,81 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB19_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -11216,8 +11219,8 @@ define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg % ; SI-LABEL: bitcast_v11f64_to_v22f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -11264,8 +11267,8 @@ define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg % ; VI-LABEL: bitcast_v11f64_to_v22f32_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v10, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -11312,8 +11315,8 @@ define inreg <22 x float> @bitcast_v11f64_to_v22f32_scalar(<11 x double> inreg % ; GFX9-LABEL: bitcast_v11f64_to_v22f32_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -12098,8 +12101,8 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v13, s24 ; SI-NEXT: v_mov_b32_e32 v14, s25 ; SI-NEXT: v_mov_b32_e32 v11, s26 -; SI-NEXT: v_mov_b32_e32 v12, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s27 ; SI-NEXT: v_mov_b32_e32 v9, s28 ; SI-NEXT: v_mov_b32_e32 v10, s29 ; SI-NEXT: s_cbranch_scc0 .LBB29_4 @@ -12114,6 +12117,7 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_lshr_b64 v[30:31], v[15:16], 16 ; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[32:33], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[21:22], 16 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 @@ -12124,8 +12128,7 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_lshr_b64 v[33:34], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 @@ -12171,21 +12174,21 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 ; SI-NEXT: v_or_b32_e32 v21, v21, v33 ; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen @@ -12195,9 +12198,9 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v31 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen @@ -12307,7 +12310,7 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -12316,18 +12319,18 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v22f32_to_v44i16_scalar: @@ -13973,6 +13976,8 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 ; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v8, v0, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: v_or_b32_e32 v9, v0, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 @@ -13982,45 +13987,43 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v12, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v20, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v60 ; SI-NEXT: v_or_b32_e32 v21, v0, v55 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -14215,31 +14218,31 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -14270,66 +14273,66 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff ; VI-NEXT: s_lshl_b32 s17, s41, 16 ; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s16, s17, s16 ; VI-NEXT: s_and_b32 s17, s19, 0xffff ; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s17, s18, s17 ; VI-NEXT: s_and_b32 s18, s20, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s15, s15, s18 ; VI-NEXT: s_and_b32 s18, s21, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -14434,6 +14437,8 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -14443,10 +14448,8 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -14464,24 +14467,22 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v13, s19 ; GFX9-NEXT: s_cbranch_execnz .LBB31_3 ; GFX9-NEXT: .LBB31_2: ; %cmp.true -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v38 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -14496,6 +14497,8 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -14594,81 +14597,81 @@ define inreg <22 x float> @bitcast_v44i16_to_v22f32_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 ; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB31_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -15797,186 +15800,130 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: v_mov_b32_e32 v11, s17 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v10, s17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, s18 ; SI-NEXT: v_mov_b32_e32 v62, s19 ; SI-NEXT: v_mov_b32_e32 v61, s20 ; SI-NEXT: v_mov_b32_e32 v58, s21 ; SI-NEXT: v_mov_b32_e32 v56, s22 ; SI-NEXT: v_mov_b32_e32 v46, s23 -; SI-NEXT: v_mov_b32_e32 v45, s24 +; SI-NEXT: v_mov_b32_e32 v44, s24 ; SI-NEXT: v_mov_b32_e32 v43, s25 ; SI-NEXT: v_mov_b32_e32 v59, s26 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v60, s27 ; SI-NEXT: v_mov_b32_e32 v57, s28 ; SI-NEXT: v_mov_b32_e32 v47, s29 -; SI-NEXT: s_cbranch_scc0 .LBB33_2 +; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v46 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v57 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 -; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v12 -; SI-NEXT: s_branch .LBB33_3 -; SI-NEXT: .LBB33_2: -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: .LBB33_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v14 -; SI-NEXT: v_mov_b32_e32 v14, v16 -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: s_cbranch_vccnz .LBB33_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v9, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 +; SI-NEXT: s_cbranch_execnz .LBB33_3 +; SI-NEXT: .LBB33_2: ; %cmp.true +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v61 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v46 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v63 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v62 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v61 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v58 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v56 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v46 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v45 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v44 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v43 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v59 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v60 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v57 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v47 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v24 @@ -15984,23 +15931,20 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 @@ -16008,22 +15952,23 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v34, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 @@ -16031,22 +15976,20 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v11 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: .LBB33_5: ; %end +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: .LBB33_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -16070,14 +16013,14 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -16085,7 +16028,7 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -16133,61 +16076,59 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -16221,6 +16162,55 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB33_4: +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v22f32_to_v44f16_scalar: ; VI: ; %bb.0: @@ -17129,142 +17119,140 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -17963,15 +17951,15 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 @@ -17995,16 +17983,16 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 @@ -18012,21 +18000,20 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 @@ -18041,16 +18028,17 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v3, v62, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 ; SI-NEXT: v_or_b32_e32 v5, v28, v5 ; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 ; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: v_or_b32_e32 v12, v57, v12 ; SI-NEXT: v_or_b32_e32 v13, v47, v13 @@ -18064,66 +18052,64 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 @@ -18137,53 +18123,55 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 @@ -18267,88 +18255,91 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v59, v46 ; SI-NEXT: v_mov_b32_e32 v46, v41 ; SI-NEXT: v_mov_b32_e32 v41, v52 ; SI-NEXT: v_mov_b32_e32 v52, v23 -; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v60, v47 ; SI-NEXT: v_mov_b32_e32 v47, v42 ; SI-NEXT: v_mov_b32_e32 v42, v53 ; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: v_mov_b32_e32 v38, v61 ; SI-NEXT: v_mov_b32_e32 v61, v56 ; SI-NEXT: v_mov_b32_e32 v56, v43 ; SI-NEXT: v_mov_b32_e32 v43, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v39, v62 ; SI-NEXT: v_mov_b32_e32 v62, v57 ; SI-NEXT: v_mov_b32_e32 v57, v44 ; SI-NEXT: v_mov_b32_e32 v44, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v33, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v63 ; SI-NEXT: v_mov_b32_e32 v63, v58 ; SI-NEXT: v_mov_b32_e32 v58, v45 ; SI-NEXT: v_mov_b32_e32 v45, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v38, v27 -; SI-NEXT: v_mov_b32_e32 v37, v28 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: v_mov_b32_e32 v32, v49 +; SI-NEXT: v_mov_b32_e32 v49, v34 +; SI-NEXT: v_mov_b32_e32 v34, v48 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v36, v28 +; SI-NEXT: v_mov_b32_e32 v35, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v28, v37 -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v29, v35 +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: v_mov_b32_e32 v27, v37 +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_mov_b32_e32 v48, v34 +; SI-NEXT: v_mov_b32_e32 v34, v49 +; SI-NEXT: v_mov_b32_e32 v49, v32 ; SI-NEXT: v_mov_b32_e32 v31, v40 ; SI-NEXT: v_mov_b32_e32 v40, v45 ; SI-NEXT: v_mov_b32_e32 v45, v58 ; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v33 ; SI-NEXT: v_mov_b32_e32 v25, v55 ; SI-NEXT: v_mov_b32_e32 v55, v44 ; SI-NEXT: v_mov_b32_e32 v44, v57 ; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v34 -; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v62, v39 ; SI-NEXT: v_mov_b32_e32 v24, v54 ; SI-NEXT: v_mov_b32_e32 v54, v43 ; SI-NEXT: v_mov_b32_e32 v43, v56 ; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v35 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v42, v47 -; SI-NEXT: v_mov_b32_e32 v47, v60 -; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v61, v38 ; SI-NEXT: v_mov_b32_e32 v23, v52 ; SI-NEXT: v_mov_b32_e32 v52, v41 ; SI-NEXT: v_mov_b32_e32 v41, v46 ; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v44f16_to_v22f32_scalar: @@ -18412,31 +18403,31 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -18457,13 +18448,13 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -18596,6 +18587,8 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -18605,10 +18598,8 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -18634,9 +18625,9 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 @@ -18758,81 +18749,81 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 ; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB35_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -19598,8 +19589,8 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, ; SI-LABEL: bitcast_v11f64_to_v11i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; SI-NEXT: v_mov_b32_e32 v21, v7 ; SI-NEXT: v_mov_b32_e32 v20, v6 ; SI-NEXT: v_mov_b32_e32 v19, v5 @@ -19646,8 +19637,8 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, ; VI-LABEL: bitcast_v11f64_to_v11i64_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v10, v8 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: v_mov_b32_e32 v11, v8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; VI-NEXT: v_mov_b32_e32 v21, v7 ; VI-NEXT: v_mov_b32_e32 v20, v6 ; VI-NEXT: v_mov_b32_e32 v19, v5 @@ -19694,8 +19685,8 @@ define inreg <11 x i64> @bitcast_v11f64_to_v11i64_scalar(<11 x double> inreg %a, ; GFX9-LABEL: bitcast_v11f64_to_v11i64_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-NEXT: v_mov_b32_e32 v20, v6 ; GFX9-NEXT: v_mov_b32_e32 v19, v5 @@ -20611,37 +20602,39 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s72, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s27, s72, 16 ; SI-NEXT: s_or_b32 s24, s24, s27 ; SI-NEXT: v_mov_b32_e32 v1, s24 ; SI-NEXT: s_and_b32 s24, s25, 0xffff ; SI-NEXT: s_lshl_b32 s25, s94, 16 ; SI-NEXT: s_or_b32 s24, s24, s25 ; SI-NEXT: v_mov_b32_e32 v2, s24 -; SI-NEXT: s_lshl_b32 s24, s62, 16 ; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_lshl_b32 s24, s62, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: s_or_b32 s22, s22, s24 -; SI-NEXT: v_mov_b32_e32 v3, s22 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s22 ; SI-NEXT: s_and_b32 s22, s23, 0xffff ; SI-NEXT: s_lshl_b32 s23, s93, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: s_or_b32 s22, s22, s23 -; SI-NEXT: v_mov_b32_e32 v4, s22 -; SI-NEXT: s_lshl_b32 s22, s60, 16 -; SI-NEXT: s_and_b32 s20, s20, 0xffff -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_lshl_b32 s22, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s20 ; SI-NEXT: s_and_b32 s20, s21, 0xffff ; SI-NEXT: s_lshl_b32 s21, s92, 16 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -22381,6 +22374,8 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 ; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v8, v0, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: v_or_b32_e32 v9, v0, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 @@ -22390,45 +22385,43 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v12, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v20, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v60 ; SI-NEXT: v_or_b32_e32 v21, v0, v55 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -22623,31 +22616,31 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -22678,66 +22671,66 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff ; VI-NEXT: s_lshl_b32 s17, s41, 16 ; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s16, s17, s16 ; VI-NEXT: s_and_b32 s17, s19, 0xffff ; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s17, s18, s17 ; VI-NEXT: s_and_b32 s18, s20, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s15, s15, s18 ; VI-NEXT: s_and_b32 s18, s21, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -22842,6 +22835,8 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -22851,10 +22846,8 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -22872,24 +22865,22 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v13, s19 ; GFX9-NEXT: s_cbranch_execnz .LBB43_3 ; GFX9-NEXT: .LBB43_2: ; %cmp.true -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v38 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -22904,6 +22895,8 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -23002,81 +22995,81 @@ define inreg <11 x i64> @bitcast_v44i16_to_v11i64_scalar(<44 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB43_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -25478,142 +25471,140 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -26312,15 +26303,15 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 @@ -26344,16 +26335,16 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 @@ -26361,21 +26352,20 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 @@ -26390,16 +26380,17 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v3, v62, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 ; SI-NEXT: v_or_b32_e32 v5, v28, v5 ; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 ; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: v_or_b32_e32 v12, v57, v12 ; SI-NEXT: v_or_b32_e32 v13, v47, v13 @@ -26413,66 +26404,64 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 @@ -26486,53 +26475,55 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 @@ -26616,88 +26607,91 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v59, v46 ; SI-NEXT: v_mov_b32_e32 v46, v41 ; SI-NEXT: v_mov_b32_e32 v41, v52 ; SI-NEXT: v_mov_b32_e32 v52, v23 -; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v60, v47 ; SI-NEXT: v_mov_b32_e32 v47, v42 ; SI-NEXT: v_mov_b32_e32 v42, v53 ; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: v_mov_b32_e32 v38, v61 ; SI-NEXT: v_mov_b32_e32 v61, v56 ; SI-NEXT: v_mov_b32_e32 v56, v43 ; SI-NEXT: v_mov_b32_e32 v43, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v39, v62 ; SI-NEXT: v_mov_b32_e32 v62, v57 ; SI-NEXT: v_mov_b32_e32 v57, v44 ; SI-NEXT: v_mov_b32_e32 v44, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v33, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v63 ; SI-NEXT: v_mov_b32_e32 v63, v58 ; SI-NEXT: v_mov_b32_e32 v58, v45 ; SI-NEXT: v_mov_b32_e32 v45, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v38, v27 -; SI-NEXT: v_mov_b32_e32 v37, v28 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: v_mov_b32_e32 v32, v49 +; SI-NEXT: v_mov_b32_e32 v49, v34 +; SI-NEXT: v_mov_b32_e32 v34, v48 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v36, v28 +; SI-NEXT: v_mov_b32_e32 v35, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v28, v37 -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v29, v35 +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: v_mov_b32_e32 v27, v37 +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_mov_b32_e32 v48, v34 +; SI-NEXT: v_mov_b32_e32 v34, v49 +; SI-NEXT: v_mov_b32_e32 v49, v32 ; SI-NEXT: v_mov_b32_e32 v31, v40 ; SI-NEXT: v_mov_b32_e32 v40, v45 ; SI-NEXT: v_mov_b32_e32 v45, v58 ; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v33 ; SI-NEXT: v_mov_b32_e32 v25, v55 ; SI-NEXT: v_mov_b32_e32 v55, v44 ; SI-NEXT: v_mov_b32_e32 v44, v57 ; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v34 -; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v62, v39 ; SI-NEXT: v_mov_b32_e32 v24, v54 ; SI-NEXT: v_mov_b32_e32 v54, v43 ; SI-NEXT: v_mov_b32_e32 v43, v56 ; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v35 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v42, v47 -; SI-NEXT: v_mov_b32_e32 v47, v60 -; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v61, v38 ; SI-NEXT: v_mov_b32_e32 v23, v52 ; SI-NEXT: v_mov_b32_e32 v52, v41 ; SI-NEXT: v_mov_b32_e32 v41, v46 ; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v44f16_to_v11i64_scalar: @@ -26761,31 +26755,31 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -26806,13 +26800,13 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -26945,6 +26939,8 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -26954,10 +26950,8 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -26983,9 +26977,9 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 @@ -27107,81 +27101,81 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB47_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -27506,8 +27500,8 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v1, v49 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -28066,8 +28060,8 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v13, s24 ; SI-NEXT: v_mov_b32_e32 v14, s25 ; SI-NEXT: v_mov_b32_e32 v11, s26 -; SI-NEXT: v_mov_b32_e32 v12, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v12, s27 ; SI-NEXT: v_mov_b32_e32 v9, s28 ; SI-NEXT: v_mov_b32_e32 v10, s29 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 @@ -28076,24 +28070,24 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 16 ; SI-NEXT: v_lshr_b64 v[26:27], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[11:12], 16 ; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[19:20], 16 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v22 -; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 @@ -28101,102 +28095,102 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_lshr_b64 v[23:24], v[7:8], 16 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 16 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_lshr_b64 v[26:27], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[31:32], v[11:12], 16 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_lshr_b64 v[28:29], v[11:12], 16 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_lshr_b64 v[30:31], v[15:16], 16 ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[21:22], 16 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v22 +; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v34 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v21, v30 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_or_b32_e32 v21, v21, v33 ; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v34 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v31 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 ; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen @@ -28208,7 +28202,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen @@ -28220,7 +28214,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -28232,7 +28226,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -28244,7 +28238,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -28256,56 +28250,56 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v11f64_to_v44i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_mov_b32_e32 v21, s16 -; VI-NEXT: v_mov_b32_e32 v22, s17 -; VI-NEXT: v_mov_b32_e32 v19, s18 -; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v17, s20 -; VI-NEXT: v_mov_b32_e32 v18, s21 -; VI-NEXT: v_mov_b32_e32 v11, s22 -; VI-NEXT: v_mov_b32_e32 v12, s23 -; VI-NEXT: v_mov_b32_e32 v9, s24 -; VI-NEXT: v_mov_b32_e32 v10, s25 -; VI-NEXT: v_mov_b32_e32 v15, s26 -; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v23, s17 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_mov_b32_e32 v17, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v13, s28 -; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 ; VI-NEXT: s_cbranch_scc0 .LBB49_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -28316,33 +28310,33 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; VI-NEXT: s_cbranch_execnz .LBB49_3 ; VI-NEXT: .LBB49_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 @@ -28351,54 +28345,54 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; VI-NEXT: .LBB49_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_or_b32_sdwa v24, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v25 -; VI-NEXT: v_or_b32_sdwa v25, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 -; VI-NEXT: v_or_b32_sdwa v22, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; VI-NEXT: v_or_b32_sdwa v23, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v38 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v8, v12, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; VI-NEXT: v_or_b32_sdwa v26, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; VI-NEXT: v_or_b32_sdwa v10, v10, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 +; VI-NEXT: v_or_b32_sdwa v27, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_or_b32_sdwa v12, v14, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v49 +; VI-NEXT: v_or_b32_sdwa v29, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; VI-NEXT: v_or_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v38 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; VI-NEXT: v_or_b32_sdwa v10, v15, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; VI-NEXT: v_or_b32_sdwa v22, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; VI-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; VI-NEXT: v_or_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28427,9 +28421,9 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -28448,21 +28442,21 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_mov_b32_e32 v21, s16 -; GFX9-NEXT: v_mov_b32_e32 v22, s17 -; GFX9-NEXT: v_mov_b32_e32 v19, s18 -; GFX9-NEXT: v_mov_b32_e32 v20, s19 -; GFX9-NEXT: v_mov_b32_e32 v17, s20 -; GFX9-NEXT: v_mov_b32_e32 v18, s21 -; GFX9-NEXT: v_mov_b32_e32 v11, s22 -; GFX9-NEXT: v_mov_b32_e32 v12, s23 -; GFX9-NEXT: v_mov_b32_e32 v9, s24 -; GFX9-NEXT: v_mov_b32_e32 v10, s25 -; GFX9-NEXT: v_mov_b32_e32 v15, s26 -; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v23, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s21 +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: v_mov_b32_e32 v17, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v13, s28 -; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s28 +; GFX9-NEXT: v_mov_b32_e32 v15, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -28473,33 +28467,33 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; GFX9-NEXT: s_cbranch_execnz .LBB49_3 ; GFX9-NEXT: .LBB49_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 @@ -28508,60 +28502,60 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; GFX9-NEXT: .LBB49_3: ; %end -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v21 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v12 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 ; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v19 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 ; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GFX9-NEXT: v_lshl_or_b32 v9, v50, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v10, v49, 16, v10 ; GFX9-NEXT: v_lshl_or_b32 v11, v48, 16, v11 ; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 @@ -28584,9 +28578,9 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr8 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 @@ -29907,6 +29901,8 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 ; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v8, v0, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: v_or_b32_e32 v9, v0, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 @@ -29916,45 +29912,43 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v12, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v14, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v16, v0, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v17, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v18, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v19, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v20, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v60 ; SI-NEXT: v_or_b32_e32 v21, v0, v55 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -30149,31 +30143,31 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -30204,66 +30198,66 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v38 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff ; VI-NEXT: s_lshl_b32 s17, s41, 16 ; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s16, s17, s16 ; VI-NEXT: s_and_b32 s17, s19, 0xffff ; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s17, s18, s17 ; VI-NEXT: s_and_b32 s18, s20, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s15, s15, s18 ; VI-NEXT: s_and_b32 s18, s21, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 @@ -30368,6 +30362,8 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -30377,10 +30373,8 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -30398,24 +30392,22 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v13, s19 ; GFX9-NEXT: s_cbranch_execnz .LBB51_3 ; GFX9-NEXT: .LBB51_2: ; %cmp.true -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v39 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v38 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v37 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v36 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v35 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v55, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v54, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v51, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v21 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -30430,6 +30422,8 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -30528,81 +30522,81 @@ define inreg <11 x double> @bitcast_v44i16_to_v11f64_scalar(<44 x i16> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB51_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -31682,8 +31676,8 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_mov_b32_e32 v11, s24 ; SI-NEXT: v_mov_b32_e32 v12, s25 ; SI-NEXT: v_mov_b32_e32 v13, s26 -; SI-NEXT: v_mov_b32_e32 v14, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v14, s27 ; SI-NEXT: v_mov_b32_e32 v9, s28 ; SI-NEXT: v_mov_b32_e32 v10, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -31737,9 +31731,9 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v57, v23 @@ -31772,7 +31766,7 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v20 @@ -31798,8 +31792,8 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 @@ -31831,7 +31825,7 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v20 @@ -31854,8 +31848,8 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 @@ -31891,14 +31885,14 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -31906,7 +31900,7 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -32053,10 +32047,10 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 @@ -32094,21 +32088,21 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; VI-NEXT: v_mov_b32_e32 v21, s16 -; VI-NEXT: v_mov_b32_e32 v22, s17 -; VI-NEXT: v_mov_b32_e32 v19, s18 -; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v17, s20 -; VI-NEXT: v_mov_b32_e32 v18, s21 -; VI-NEXT: v_mov_b32_e32 v11, s22 -; VI-NEXT: v_mov_b32_e32 v12, s23 -; VI-NEXT: v_mov_b32_e32 v9, s24 -; VI-NEXT: v_mov_b32_e32 v10, s25 -; VI-NEXT: v_mov_b32_e32 v15, s26 -; VI-NEXT: v_mov_b32_e32 v16, s27 +; VI-NEXT: v_mov_b32_e32 v22, s16 +; VI-NEXT: v_mov_b32_e32 v23, s17 +; VI-NEXT: v_mov_b32_e32 v20, s18 +; VI-NEXT: v_mov_b32_e32 v21, s19 +; VI-NEXT: v_mov_b32_e32 v18, s20 +; VI-NEXT: v_mov_b32_e32 v19, s21 +; VI-NEXT: v_mov_b32_e32 v16, s22 +; VI-NEXT: v_mov_b32_e32 v17, s23 +; VI-NEXT: v_mov_b32_e32 v12, s24 +; VI-NEXT: v_mov_b32_e32 v13, s25 +; VI-NEXT: v_mov_b32_e32 v10, s26 +; VI-NEXT: v_mov_b32_e32 v11, s27 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v13, s28 -; VI-NEXT: v_mov_b32_e32 v14, s29 +; VI-NEXT: v_mov_b32_e32 v14, s28 +; VI-NEXT: v_mov_b32_e32 v15, s29 ; VI-NEXT: s_cbranch_scc0 .LBB53_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -32119,33 +32113,33 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; VI-NEXT: s_cbranch_execnz .LBB53_3 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 @@ -32154,54 +32148,54 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; VI-NEXT: .LBB53_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_or_b32_sdwa v24, v21, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v25 -; VI-NEXT: v_or_b32_sdwa v25, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 -; VI-NEXT: v_or_b32_sdwa v22, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v23 -; VI-NEXT: v_or_b32_sdwa v23, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; VI-NEXT: v_or_b32_sdwa v24, v22, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v25 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v38 -; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 -; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v26 +; VI-NEXT: v_or_b32_sdwa v8, v12, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 +; VI-NEXT: v_or_b32_sdwa v26, v20, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; VI-NEXT: v_or_b32_sdwa v10, v10, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 +; VI-NEXT: v_or_b32_sdwa v27, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; VI-NEXT: v_or_b32_sdwa v28, v18, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_or_b32_sdwa v12, v14, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v37 -; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v49 +; VI-NEXT: v_or_b32_sdwa v29, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 +; VI-NEXT: v_or_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v38 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 -; VI-NEXT: v_or_b32_sdwa v10, v15, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 +; VI-NEXT: v_or_b32_sdwa v22, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; VI-NEXT: v_or_b32_sdwa v13, v15, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v35 -; VI-NEXT: v_or_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -32230,9 +32224,9 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr9 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -32251,21 +32245,21 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_mov_b32_e32 v21, s16 -; GFX9-NEXT: v_mov_b32_e32 v22, s17 -; GFX9-NEXT: v_mov_b32_e32 v19, s18 -; GFX9-NEXT: v_mov_b32_e32 v20, s19 -; GFX9-NEXT: v_mov_b32_e32 v17, s20 -; GFX9-NEXT: v_mov_b32_e32 v18, s21 -; GFX9-NEXT: v_mov_b32_e32 v11, s22 -; GFX9-NEXT: v_mov_b32_e32 v12, s23 -; GFX9-NEXT: v_mov_b32_e32 v9, s24 -; GFX9-NEXT: v_mov_b32_e32 v10, s25 -; GFX9-NEXT: v_mov_b32_e32 v15, s26 -; GFX9-NEXT: v_mov_b32_e32 v16, s27 +; GFX9-NEXT: v_mov_b32_e32 v22, s16 +; GFX9-NEXT: v_mov_b32_e32 v23, s17 +; GFX9-NEXT: v_mov_b32_e32 v20, s18 +; GFX9-NEXT: v_mov_b32_e32 v21, s19 +; GFX9-NEXT: v_mov_b32_e32 v18, s20 +; GFX9-NEXT: v_mov_b32_e32 v19, s21 +; GFX9-NEXT: v_mov_b32_e32 v16, s22 +; GFX9-NEXT: v_mov_b32_e32 v17, s23 +; GFX9-NEXT: v_mov_b32_e32 v12, s24 +; GFX9-NEXT: v_mov_b32_e32 v13, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v13, s28 -; GFX9-NEXT: v_mov_b32_e32 v14, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s28 +; GFX9-NEXT: v_mov_b32_e32 v15, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 @@ -32276,33 +32270,33 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; GFX9-NEXT: s_cbranch_execnz .LBB53_3 ; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 @@ -32311,60 +32305,60 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22 ; GFX9-NEXT: .LBB53_3: ; %end -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v21 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v12 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v24, v24, 16, v22 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v12 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v14 ; GFX9-NEXT: v_lshl_or_b32 v14, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v22 +; GFX9-NEXT: v_lshl_or_b32 v22, v51, 16, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v36, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v23, v23, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v50, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v35, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v19 ; GFX9-NEXT: v_lshl_or_b32 v17, v34, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v19 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v18 ; GFX9-NEXT: v_lshl_or_b32 v18, v33, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v19 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v21 ; GFX9-NEXT: v_lshl_or_b32 v19, v32, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v21 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v20 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_lshl_or_b32 v20, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GFX9-NEXT: v_lshl_or_b32 v9, v50, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v10, v49, 16, v10 ; GFX9-NEXT: v_lshl_or_b32 v11, v48, 16, v11 ; GFX9-NEXT: v_lshl_or_b32 v12, v39, 16, v12 @@ -32387,9 +32381,9 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr51 -; GFX9-NEXT: ; implicit-def: $vgpr23 -; GFX9-NEXT: ; implicit-def: $vgpr8 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr9 ; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr39 @@ -32974,142 +32968,140 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v32 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v60 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v52 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -33808,15 +33800,15 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 @@ -33840,16 +33832,16 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v63, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v27, s29 @@ -33857,21 +33849,20 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 @@ -33886,16 +33877,17 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v1, v34, v1 -; SI-NEXT: v_or_b32_e32 v3, v62, v3 -; SI-NEXT: v_or_b32_e32 v4, v36, v4 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v2 +; SI-NEXT: v_or_b32_e32 v3, v49, v3 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 ; SI-NEXT: v_or_b32_e32 v5, v28, v5 ; SI-NEXT: v_or_b32_e32 v6, v26, v6 -; SI-NEXT: v_or_b32_e32 v7, v48, v7 +; SI-NEXT: v_or_b32_e32 v7, v60, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 -; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_or_b32_e32 v10, v33, v10 ; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: v_or_b32_e32 v12, v57, v12 ; SI-NEXT: v_or_b32_e32 v13, v47, v13 @@ -33909,66 +33901,64 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v26 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v60 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v33 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v47 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v44 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v53 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 @@ -33982,53 +33972,55 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v62 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v61 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v28 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v27 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 @@ -34112,88 +34104,91 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v59, v46 ; SI-NEXT: v_mov_b32_e32 v46, v41 ; SI-NEXT: v_mov_b32_e32 v41, v52 ; SI-NEXT: v_mov_b32_e32 v52, v23 -; SI-NEXT: v_mov_b32_e32 v48, v60 +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v60, v47 ; SI-NEXT: v_mov_b32_e32 v47, v42 ; SI-NEXT: v_mov_b32_e32 v42, v53 ; SI-NEXT: v_mov_b32_e32 v53, v22 -; SI-NEXT: v_mov_b32_e32 v35, v61 +; SI-NEXT: v_mov_b32_e32 v38, v61 ; SI-NEXT: v_mov_b32_e32 v61, v56 ; SI-NEXT: v_mov_b32_e32 v56, v43 ; SI-NEXT: v_mov_b32_e32 v43, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 -; SI-NEXT: v_mov_b32_e32 v50, v34 -; SI-NEXT: v_mov_b32_e32 v34, v62 +; SI-NEXT: v_mov_b32_e32 v39, v62 ; SI-NEXT: v_mov_b32_e32 v62, v57 ; SI-NEXT: v_mov_b32_e32 v57, v44 ; SI-NEXT: v_mov_b32_e32 v44, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v33, v51 +; SI-NEXT: v_mov_b32_e32 v51, v50 +; SI-NEXT: v_mov_b32_e32 v50, v63 ; SI-NEXT: v_mov_b32_e32 v63, v58 ; SI-NEXT: v_mov_b32_e32 v58, v45 ; SI-NEXT: v_mov_b32_e32 v45, v40 ; SI-NEXT: v_mov_b32_e32 v40, v31 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v38, v27 -; SI-NEXT: v_mov_b32_e32 v37, v28 -; SI-NEXT: v_mov_b32_e32 v49, v36 -; SI-NEXT: v_mov_b32_e32 v36, v29 +; SI-NEXT: v_mov_b32_e32 v32, v49 +; SI-NEXT: v_mov_b32_e32 v49, v34 +; SI-NEXT: v_mov_b32_e32 v34, v48 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v36, v28 +; SI-NEXT: v_mov_b32_e32 v35, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v28, v37 -; SI-NEXT: v_mov_b32_e32 v27, v38 -; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v29, v35 +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: v_mov_b32_e32 v27, v37 +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_mov_b32_e32 v48, v34 +; SI-NEXT: v_mov_b32_e32 v34, v49 +; SI-NEXT: v_mov_b32_e32 v49, v32 ; SI-NEXT: v_mov_b32_e32 v31, v40 ; SI-NEXT: v_mov_b32_e32 v40, v45 ; SI-NEXT: v_mov_b32_e32 v45, v58 ; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v63, v50 +; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v51, v33 ; SI-NEXT: v_mov_b32_e32 v25, v55 ; SI-NEXT: v_mov_b32_e32 v55, v44 ; SI-NEXT: v_mov_b32_e32 v44, v57 ; SI-NEXT: v_mov_b32_e32 v57, v62 -; SI-NEXT: v_mov_b32_e32 v62, v34 -; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v62, v39 ; SI-NEXT: v_mov_b32_e32 v24, v54 ; SI-NEXT: v_mov_b32_e32 v54, v43 ; SI-NEXT: v_mov_b32_e32 v43, v56 ; SI-NEXT: v_mov_b32_e32 v56, v61 -; SI-NEXT: v_mov_b32_e32 v61, v35 -; SI-NEXT: v_mov_b32_e32 v22, v53 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: v_mov_b32_e32 v42, v47 -; SI-NEXT: v_mov_b32_e32 v47, v60 -; SI-NEXT: v_mov_b32_e32 v60, v48 +; SI-NEXT: v_mov_b32_e32 v61, v38 ; SI-NEXT: v_mov_b32_e32 v23, v52 ; SI-NEXT: v_mov_b32_e32 v52, v41 ; SI-NEXT: v_mov_b32_e32 v41, v46 ; SI-NEXT: v_mov_b32_e32 v46, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v53, v42 +; SI-NEXT: v_mov_b32_e32 v42, v47 +; SI-NEXT: v_mov_b32_e32 v47, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v44f16_to_v11f64_scalar: @@ -34257,31 +34252,31 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 ; VI-NEXT: v_or_b32_sdwa v14, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v19, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -34302,13 +34297,13 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; VI-NEXT: .LBB55_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -34441,6 +34436,8 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v36 @@ -34450,10 +34447,8 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v19, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v38 ; GFX9-NEXT: v_lshl_or_b32 v20, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v21, v48, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -34479,9 +34474,9 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v55, 16, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v15, v54, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v53, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v52, 16, v17 @@ -34603,81 +34598,81 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v188, s32 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v3 :: v_dual_mov_b32 v186, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v1 :: v_dual_mov_b32 v188, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 ; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB55_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -35977,129 +35972,129 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v46, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v1 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v6 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 @@ -36117,33 +36112,33 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v37, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s27 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v39, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v46, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 @@ -36152,13 +36147,13 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v57, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB57_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -36167,78 +36162,80 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -36248,7 +36245,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -36259,7 +36256,7 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -36269,33 +36266,31 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -36352,31 +36347,31 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr61 @@ -37961,7 +37956,7 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-LABEL: bitcast_v44f16_to_v44i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -37978,335 +37973,335 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v57, v2 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v58, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v40, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s18 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v63, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v54, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v61, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v62, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v59, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v60, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v26 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_or_b32_e32 v2, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v26, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v4, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v24, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v6, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: v_or_b32_e32 v8, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v10, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v31 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_or_b32_e32 v31, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_or_b32_e32 v33, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v35, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v17, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_or_b32_e32 v19, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_lshr_b64 v[26:27], v[18:19], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_or_b32_e32 v21, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v16, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_or_b32_e32 v40, v3, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_lshr_b64 v[28:29], v[20:21], 16 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v22, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v54, v1, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_or_b32_e32 v62, v5, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v18, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_or_b32_e32 v60, v3, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v14, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v58, v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: v_or_b32_e32 v56, v5, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v36 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 -; SI-NEXT: v_or_b32_e32 v10, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_or_b32_e32 v46, v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v42 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v41 -; SI-NEXT: v_or_b32_e32 v8, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v5 +; SI-NEXT: v_or_b32_e32 v44, v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_or_b32_e32 v42, v12, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v4, v3, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v52 -; SI-NEXT: v_or_b32_e32 v2, v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v54, v12, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v58 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v60, v19, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v40, v11, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_or_b32_e32 v58, v12, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 -; SI-NEXT: v_or_b32_e32 v46, v19, v9 -; SI-NEXT: v_or_b32_e32 v62, v11, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v56, v11, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v45, v12, v7 -; SI-NEXT: v_or_b32_e32 v12, v19, v3 -; SI-NEXT: v_or_b32_e32 v43, v11, v5 -; SI-NEXT: v_or_b32_e32 v11, v20, v1 -; SI-NEXT: v_lshr_b64 v[29:30], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 -; SI-NEXT: v_mov_b32_e32 v35, v12 -; SI-NEXT: v_mov_b32_e32 v33, v11 -; SI-NEXT: v_lshr_b64 v[30:31], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[19:20], v[3:4], 16 +; SI-NEXT: v_or_b32_e32 v13, v13, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshr_b64 v[22:23], v[34:35], 16 +; SI-NEXT: v_or_b32_e32 v11, v12, v1 +; SI-NEXT: v_mov_b32_e32 v23, v13 +; SI-NEXT: v_lshr_b64 v[13:14], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[5:6], 16 +; SI-NEXT: v_mov_b32_e32 v14, v11 ; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v62 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -38317,8 +38312,8 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -38329,8 +38324,8 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -38341,7 +38336,7 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 530ff4f30fd05..89a8df4f2e9e9 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -2252,9 +2252,9 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2302,9 +2302,9 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2352,9 +2352,9 @@ define inreg <24 x i32> @bitcast_v12f64_to_v24i32_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 @@ -3297,6 +3297,10 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: s_lshr_b32 s88, s5, 16 ; SI-NEXT: s_lshr_b32 s89, s7, 16 ; SI-NEXT: s_lshr_b32 s90, s9, 16 @@ -3309,36 +3313,33 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s31, s23, 16 ; SI-NEXT: s_lshr_b32 s34, s25, 16 ; SI-NEXT: s_lshr_b32 s35, s41, 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s76, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s40, 0xffff +; SI-NEXT: s_lshl_b32 s29, s76, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v1, s27 ; SI-NEXT: s_and_b32 s27, s41, 0xffff ; SI-NEXT: s_lshl_b32 s29, s35, 16 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s74, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: v_mov_b32_e32 v3, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: s_lshl_b32 s25, s34, 16 +; SI-NEXT: s_lshl_b32 s27, s74, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_or_b32 s24, s24, s27 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s34, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s24 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: s_lshl_b32 s24, s72, 16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -5200,6 +5201,13 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v48i16_to_v24i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v2 +; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -5216,15 +5224,6 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v56, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v31, v22 ; SI-NEXT: v_mov_b32_e32 v34, v20 ; SI-NEXT: v_mov_b32_e32 v35, v18 @@ -5234,85 +5233,85 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v39, v10 ; SI-NEXT: v_mov_b32_e32 v48, v8 ; SI-NEXT: v_mov_b32_e32 v50, v6 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v7, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v7, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v8, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: v_or_b32_e32 v9, v0, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v10, v0, v55 +; SI-NEXT: v_or_b32_e32 v10, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v11, v0, v62 +; SI-NEXT: v_or_b32_e32 v11, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v12, v0, v46 +; SI-NEXT: v_or_b32_e32 v12, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v13, v0, v45 +; SI-NEXT: v_or_b32_e32 v13, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v14, v0, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v15, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v15, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v16, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v17, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v19, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v20, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v21, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v22, v0, v27 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v57 ; SI-NEXT: v_or_b32_e32 v23, v0, v25 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -5323,31 +5322,33 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -5355,11 +5356,13 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -5371,12 +5374,12 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -5408,16 +5411,13 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -5426,7 +5426,6 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -5455,25 +5454,24 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: v_mov_b32_e32 v51, v39 ; SI-NEXT: v_mov_b32_e32 v39, v34 ; SI-NEXT: v_mov_b32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v52 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v52 ; SI-NEXT: v_mov_b32_e32 v52, v48 ; SI-NEXT: v_mov_b32_e32 v48, v35 ; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v53, v49 ; SI-NEXT: v_mov_b32_e32 v49, v36 ; SI-NEXT: v_mov_b32_e32 v36, v26 -; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v41, v50 ; SI-NEXT: v_mov_b32_e32 v50, v37 ; SI-NEXT: v_mov_b32_e32 v37, v24 ; SI-NEXT: v_mov_b32_e32 v33, v32 @@ -5481,26 +5479,27 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v56, v40 ; SI-NEXT: v_mov_b32_e32 v40, v38 ; SI-NEXT: v_mov_b32_e32 v38, v31 -; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v46, v54 +; SI-NEXT: v_mov_b32_e32 v54, v25 ; SI-NEXT: v_mov_b32_e32 v44, v27 -; SI-NEXT: v_mov_b32_e32 v53, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v47, v55 +; SI-NEXT: v_mov_b32_e32 v55, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: v_mov_b32_e32 v35, v48 +; SI-NEXT: v_mov_b32_e32 v48, v52 +; SI-NEXT: v_mov_b32_e32 v52, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v30, v34 ; SI-NEXT: v_mov_b32_e32 v34, v39 ; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v55 +; SI-NEXT: v_mov_b32_e32 v55, v47 ; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v25, v54 +; SI-NEXT: v_mov_b32_e32 v54, v46 ; SI-NEXT: v_mov_b32_e32 v31, v38 ; SI-NEXT: v_mov_b32_e32 v38, v40 ; SI-NEXT: v_mov_b32_e32 v40, v56 @@ -5508,11 +5507,12 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v32, v33 ; SI-NEXT: v_mov_b32_e32 v24, v37 ; SI-NEXT: v_mov_b32_e32 v37, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: v_mov_b32_e32 v28, v35 -; SI-NEXT: v_mov_b32_e32 v35, v48 -; SI-NEXT: v_mov_b32_e32 v48, v52 -; SI-NEXT: v_mov_b32_e32 v52, v47 +; SI-NEXT: v_mov_b32_e32 v50, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v49, v53 +; SI-NEXT: v_mov_b32_e32 v53, v45 ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v48i16_to_v24i32_scalar: @@ -5995,83 +5995,83 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB15_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -8702,152 +8702,150 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -9678,8 +9676,8 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -9752,41 +9750,36 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; SI-NEXT: .LBB19_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 @@ -9813,88 +9806,91 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 @@ -10514,83 +10510,83 @@ define inreg <24 x i32> @bitcast_v48f16_to_v24i32_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB19_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -12212,9 +12208,9 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 @@ -12262,9 +12258,9 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 @@ -12312,9 +12308,9 @@ define inreg <24 x float> @bitcast_v12f64_to_v24f32_scalar(<12 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 @@ -13135,177 +13131,174 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v17, s22 ; SI-NEXT: v_mov_b32_e32 v18, s23 ; SI-NEXT: v_mov_b32_e32 v15, s24 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v16, s25 ; SI-NEXT: v_mov_b32_e32 v13, s26 ; SI-NEXT: v_mov_b32_e32 v14, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s28 ; SI-NEXT: v_mov_b32_e32 v12, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[11:12], 16 ; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 ; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[28:29], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[19:20], 16 ; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v20 +; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 -; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v24 +; SI-NEXT: v_lshr_b64 v[36:37], v[23:24], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[11:12], 16 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshr_b64 v[28:29], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[19:20], 16 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v20 +; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[31:32], v[13:14], 16 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_lshr_b64 v[32:33], v[15:16], 16 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshr_b64 v[34:35], v[19:20], 16 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_lshr_b64 v[35:36], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v24 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v37 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v31 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 ; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v41 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v35 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v40 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v34 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 ; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen @@ -13317,7 +13310,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -13329,7 +13322,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -13341,7 +13334,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -13353,7 +13346,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -13365,18 +13358,15 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr35 @@ -13385,20 +13375,22 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v24f32_to_v48i16_scalar: @@ -15141,6 +15133,13 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-LABEL: bitcast_v48i16_to_v24f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v2 +; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -15157,15 +15156,6 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v56, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v31, v22 ; SI-NEXT: v_mov_b32_e32 v34, v20 ; SI-NEXT: v_mov_b32_e32 v35, v18 @@ -15175,85 +15165,85 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v39, v10 ; SI-NEXT: v_mov_b32_e32 v48, v8 ; SI-NEXT: v_mov_b32_e32 v50, v6 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v7, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v7, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v8, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: v_or_b32_e32 v9, v0, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v10, v0, v55 +; SI-NEXT: v_or_b32_e32 v10, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v11, v0, v62 +; SI-NEXT: v_or_b32_e32 v11, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v12, v0, v46 +; SI-NEXT: v_or_b32_e32 v12, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v13, v0, v45 +; SI-NEXT: v_or_b32_e32 v13, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v14, v0, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v15, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v15, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v16, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v17, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v19, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v20, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v21, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v22, v0, v27 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v57 ; SI-NEXT: v_or_b32_e32 v23, v0, v25 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -15264,31 +15254,33 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -15296,11 +15288,13 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -15312,12 +15306,12 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -15349,16 +15343,13 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -15367,7 +15358,6 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -15396,25 +15386,24 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: v_mov_b32_e32 v51, v39 ; SI-NEXT: v_mov_b32_e32 v39, v34 ; SI-NEXT: v_mov_b32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v52 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v52 ; SI-NEXT: v_mov_b32_e32 v52, v48 ; SI-NEXT: v_mov_b32_e32 v48, v35 ; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v53, v49 ; SI-NEXT: v_mov_b32_e32 v49, v36 ; SI-NEXT: v_mov_b32_e32 v36, v26 -; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v41, v50 ; SI-NEXT: v_mov_b32_e32 v50, v37 ; SI-NEXT: v_mov_b32_e32 v37, v24 ; SI-NEXT: v_mov_b32_e32 v33, v32 @@ -15422,26 +15411,27 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v56, v40 ; SI-NEXT: v_mov_b32_e32 v40, v38 ; SI-NEXT: v_mov_b32_e32 v38, v31 -; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v46, v54 +; SI-NEXT: v_mov_b32_e32 v54, v25 ; SI-NEXT: v_mov_b32_e32 v44, v27 -; SI-NEXT: v_mov_b32_e32 v53, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v47, v55 +; SI-NEXT: v_mov_b32_e32 v55, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: v_mov_b32_e32 v35, v48 +; SI-NEXT: v_mov_b32_e32 v48, v52 +; SI-NEXT: v_mov_b32_e32 v52, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v30, v34 ; SI-NEXT: v_mov_b32_e32 v34, v39 ; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v55 +; SI-NEXT: v_mov_b32_e32 v55, v47 ; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v25, v54 +; SI-NEXT: v_mov_b32_e32 v54, v46 ; SI-NEXT: v_mov_b32_e32 v31, v38 ; SI-NEXT: v_mov_b32_e32 v38, v40 ; SI-NEXT: v_mov_b32_e32 v40, v56 @@ -15449,11 +15439,12 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v32, v33 ; SI-NEXT: v_mov_b32_e32 v24, v37 ; SI-NEXT: v_mov_b32_e32 v37, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: v_mov_b32_e32 v28, v35 -; SI-NEXT: v_mov_b32_e32 v35, v48 -; SI-NEXT: v_mov_b32_e32 v48, v52 -; SI-NEXT: v_mov_b32_e32 v52, v47 +; SI-NEXT: v_mov_b32_e32 v50, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v49, v53 +; SI-NEXT: v_mov_b32_e32 v53, v45 ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v48i16_to_v24f32_scalar: @@ -15936,83 +15927,83 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 ; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB31_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -17253,8 +17244,8 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, s17 ; SI-NEXT: v_mov_b32_e32 v13, s18 ; SI-NEXT: v_mov_b32_e32 v12, s19 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v62, s20 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, s20 ; SI-NEXT: v_mov_b32_e32 v59, s21 ; SI-NEXT: v_mov_b32_e32 v58, s22 ; SI-NEXT: v_mov_b32_e32 v57, s23 @@ -17263,8 +17254,8 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v17, s25 ; SI-NEXT: v_mov_b32_e32 v14, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, s28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, s28 ; SI-NEXT: v_mov_b32_e32 v60, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -17284,7 +17275,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v62 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 @@ -17299,23 +17290,23 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v28, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v19 @@ -17324,9 +17315,9 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v17 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -17339,13 +17330,13 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v12 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v15 ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v5 @@ -17371,14 +17362,14 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v62 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v61 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v59 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v58 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v63 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v62 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v60 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 @@ -17388,25 +17379,25 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 @@ -17420,8 +17411,8 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v16 @@ -17429,17 +17420,18 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v58 @@ -17447,17 +17439,17 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_mov_b32_e32 v21, v10 ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -17469,10 +17461,10 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: .LBB33_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -17490,7 +17482,7 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -17517,28 +17509,28 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -17546,14 +17538,14 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -17681,12 +17673,12 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; kill: killed $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr54 @@ -17695,16 +17687,16 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr30 @@ -18735,152 +18727,150 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -19711,8 +19701,8 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -19785,41 +19775,36 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; SI-NEXT: .LBB35_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 @@ -19846,88 +19831,91 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 @@ -20547,83 +20535,83 @@ define inreg <24 x float> @bitcast_v48f16_to_v24f32_scalar(<48 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 ; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB35_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -21447,9 +21435,9 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 @@ -21497,9 +21485,9 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB39_4 @@ -21547,9 +21535,9 @@ define inreg <12 x i64> @bitcast_v12f64_to_v12i64_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 @@ -22521,31 +22509,32 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[74:75], s[24:25], 16 ; SI-NEXT: s_lshr_b64 s[76:77], s[40:41], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s76, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s40, 0xffff +; SI-NEXT: s_lshl_b32 s29, s76, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v1, s27 ; SI-NEXT: s_and_b32 s27, s41, 0xffff ; SI-NEXT: s_lshl_b32 s29, s35, 16 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s74, 16 ; SI-NEXT: s_and_b32 s24, s24, 0xffff -; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: v_mov_b32_e32 v3, s24 -; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: s_lshl_b32 s25, s34, 16 +; SI-NEXT: s_lshl_b32 s27, s74, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: s_or_b32 s24, s24, s27 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xffff +; SI-NEXT: s_lshl_b32 s25, s34, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s24 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: s_lshl_b32 s24, s72, 16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -24407,6 +24396,13 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-LABEL: bitcast_v48i16_to_v12i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v2 +; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -24423,15 +24419,6 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v56, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v31, v22 ; SI-NEXT: v_mov_b32_e32 v34, v20 ; SI-NEXT: v_mov_b32_e32 v35, v18 @@ -24441,85 +24428,85 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v39, v10 ; SI-NEXT: v_mov_b32_e32 v48, v8 ; SI-NEXT: v_mov_b32_e32 v50, v6 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v7, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v7, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v8, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: v_or_b32_e32 v9, v0, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v10, v0, v55 +; SI-NEXT: v_or_b32_e32 v10, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v11, v0, v62 +; SI-NEXT: v_or_b32_e32 v11, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v12, v0, v46 +; SI-NEXT: v_or_b32_e32 v12, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v13, v0, v45 +; SI-NEXT: v_or_b32_e32 v13, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v14, v0, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v15, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v15, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v16, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v17, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v19, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v20, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v21, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v22, v0, v27 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v57 ; SI-NEXT: v_or_b32_e32 v23, v0, v25 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -24530,31 +24517,33 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -24562,11 +24551,13 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -24578,12 +24569,12 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -24615,16 +24606,13 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -24633,7 +24621,6 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -24662,25 +24649,24 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: v_mov_b32_e32 v51, v39 ; SI-NEXT: v_mov_b32_e32 v39, v34 ; SI-NEXT: v_mov_b32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v52 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v52 ; SI-NEXT: v_mov_b32_e32 v52, v48 ; SI-NEXT: v_mov_b32_e32 v48, v35 ; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v53, v49 ; SI-NEXT: v_mov_b32_e32 v49, v36 ; SI-NEXT: v_mov_b32_e32 v36, v26 -; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v41, v50 ; SI-NEXT: v_mov_b32_e32 v50, v37 ; SI-NEXT: v_mov_b32_e32 v37, v24 ; SI-NEXT: v_mov_b32_e32 v33, v32 @@ -24688,26 +24674,27 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v56, v40 ; SI-NEXT: v_mov_b32_e32 v40, v38 ; SI-NEXT: v_mov_b32_e32 v38, v31 -; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v46, v54 +; SI-NEXT: v_mov_b32_e32 v54, v25 ; SI-NEXT: v_mov_b32_e32 v44, v27 -; SI-NEXT: v_mov_b32_e32 v53, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v47, v55 +; SI-NEXT: v_mov_b32_e32 v55, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: v_mov_b32_e32 v35, v48 +; SI-NEXT: v_mov_b32_e32 v48, v52 +; SI-NEXT: v_mov_b32_e32 v52, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v30, v34 ; SI-NEXT: v_mov_b32_e32 v34, v39 ; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v55 +; SI-NEXT: v_mov_b32_e32 v55, v47 ; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v25, v54 +; SI-NEXT: v_mov_b32_e32 v54, v46 ; SI-NEXT: v_mov_b32_e32 v31, v38 ; SI-NEXT: v_mov_b32_e32 v38, v40 ; SI-NEXT: v_mov_b32_e32 v40, v56 @@ -24715,11 +24702,12 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v32, v33 ; SI-NEXT: v_mov_b32_e32 v24, v37 ; SI-NEXT: v_mov_b32_e32 v37, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: v_mov_b32_e32 v28, v35 -; SI-NEXT: v_mov_b32_e32 v35, v48 -; SI-NEXT: v_mov_b32_e32 v48, v52 -; SI-NEXT: v_mov_b32_e32 v52, v47 +; SI-NEXT: v_mov_b32_e32 v50, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v49, v53 +; SI-NEXT: v_mov_b32_e32 v53, v45 ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v48i16_to_v12i64_scalar: @@ -25202,83 +25190,83 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB43_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -27921,152 +27909,150 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -28897,8 +28883,8 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -28971,41 +28957,36 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; SI-NEXT: .LBB47_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 @@ -29032,88 +29013,91 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 @@ -29733,83 +29717,83 @@ define inreg <12 x i64> @bitcast_v48f16_to_v12i64_scalar(<48 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB47_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -30758,165 +30742,162 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v17, s22 ; SI-NEXT: v_mov_b32_e32 v18, s23 ; SI-NEXT: v_mov_b32_e32 v15, s24 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v16, s25 ; SI-NEXT: v_mov_b32_e32 v13, s26 ; SI-NEXT: v_mov_b32_e32 v14, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s28 ; SI-NEXT: v_mov_b32_e32 v12, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[11:12], 16 ; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 ; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[28:29], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[19:20], 16 ; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v20 +; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 -; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v24 +; SI-NEXT: v_lshr_b64 v[36:37], v[23:24], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[11:12], 16 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_lshr_b64 v[28:29], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[19:20], 16 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v20 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_lshr_b64 v[31:32], v[13:14], 16 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_lshr_b64 v[32:33], v[15:16], 16 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshr_b64 v[34:35], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v22 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v24 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v37 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v31 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_or_b32_e32 v23, v23, v36 ; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v41 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v35 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v40 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v34 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v55 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v54 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 ; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v52 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen @@ -30928,7 +30909,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -30940,7 +30921,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -30952,7 +30933,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -30964,7 +30945,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -30976,18 +30957,15 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr35 @@ -30996,20 +30974,22 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v12f64_to_v48i16_scalar: @@ -31018,17 +30998,17 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: v_mov_b32_e32 v23, s16 ; VI-NEXT: v_mov_b32_e32 v24, s17 -; VI-NEXT: v_mov_b32_e32 v19, s18 -; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v13, s20 -; VI-NEXT: v_mov_b32_e32 v14, s21 -; VI-NEXT: v_mov_b32_e32 v11, s22 -; VI-NEXT: v_mov_b32_e32 v12, s23 -; VI-NEXT: v_mov_b32_e32 v21, s24 -; VI-NEXT: v_mov_b32_e32 v22, s25 -; VI-NEXT: v_mov_b32_e32 v17, s26 -; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_mov_b32_e32 v21, s18 +; VI-NEXT: v_mov_b32_e32 v22, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v14, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v19, s26 +; VI-NEXT: v_mov_b32_e32 v20, s27 ; VI-NEXT: v_mov_b32_e32 v15, s28 ; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: s_cbranch_scc0 .LBB49_4 @@ -31045,16 +31025,16 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; VI-NEXT: s_cbranch_execnz .LBB49_3 @@ -31065,11 +31045,11 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 @@ -31083,16 +31063,16 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; VI-NEXT: .LBB49_3: ; %end @@ -31101,42 +31081,42 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 ; VI-NEXT: v_or_b32_sdwa v31, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 -; VI-NEXT: v_or_b32_sdwa v24, v13, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 -; VI-NEXT: v_or_b32_sdwa v25, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; VI-NEXT: v_or_b32_sdwa v32, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; VI-NEXT: v_or_b32_sdwa v24, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 -; VI-NEXT: v_or_b32_sdwa v27, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 ; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; VI-NEXT: v_or_b32_sdwa v28, v21, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; VI-NEXT: v_or_b32_sdwa v29, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v17, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v10, v19, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 ; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_or_b32_sdwa v11, v20, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 ; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -31188,17 +31168,17 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, s16 ; GFX9-NEXT: v_mov_b32_e32 v24, s17 -; GFX9-NEXT: v_mov_b32_e32 v19, s18 -; GFX9-NEXT: v_mov_b32_e32 v20, s19 -; GFX9-NEXT: v_mov_b32_e32 v13, s20 -; GFX9-NEXT: v_mov_b32_e32 v14, s21 -; GFX9-NEXT: v_mov_b32_e32 v11, s22 -; GFX9-NEXT: v_mov_b32_e32 v12, s23 -; GFX9-NEXT: v_mov_b32_e32 v21, s24 -; GFX9-NEXT: v_mov_b32_e32 v22, s25 -; GFX9-NEXT: v_mov_b32_e32 v17, s26 -; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: v_mov_b32_e32 v21, s18 +; GFX9-NEXT: v_mov_b32_e32 v22, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v14, s23 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_mov_b32_e32 v12, s25 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v19, s26 +; GFX9-NEXT: v_mov_b32_e32 v20, s27 ; GFX9-NEXT: v_mov_b32_e32 v15, s28 ; GFX9-NEXT: v_mov_b32_e32 v16, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB49_4 @@ -31215,16 +31195,16 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; GFX9-NEXT: s_cbranch_execnz .LBB49_3 @@ -31235,11 +31215,11 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 @@ -31253,58 +31233,58 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; GFX9-NEXT: .LBB49_3: ; %end -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v13 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v23 @@ -32728,6 +32708,13 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-LABEL: bitcast_v48i16_to_v12f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v4 +; SI-NEXT: v_mov_b32_e32 v52, v2 +; SI-NEXT: v_mov_b32_e32 v53, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -32744,15 +32731,6 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v56, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 ; SI-NEXT: v_mov_b32_e32 v31, v22 ; SI-NEXT: v_mov_b32_e32 v34, v20 ; SI-NEXT: v_mov_b32_e32 v35, v18 @@ -32762,85 +32740,85 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v39, v10 ; SI-NEXT: v_mov_b32_e32 v48, v8 ; SI-NEXT: v_mov_b32_e32 v50, v6 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v7, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_or_b32_e32 v7, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v8, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: v_or_b32_e32 v9, v0, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v10, v0, v55 +; SI-NEXT: v_or_b32_e32 v10, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v11, v0, v62 +; SI-NEXT: v_or_b32_e32 v11, v0, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v12, v0, v46 +; SI-NEXT: v_or_b32_e32 v12, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v13, v0, v45 +; SI-NEXT: v_or_b32_e32 v13, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v14, v0, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v15, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v15, v0, v43 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v16, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v17, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v19, v0, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v20, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v21, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v22, v0, v27 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v57 ; SI-NEXT: v_or_b32_e32 v23, v0, v25 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -32851,31 +32829,33 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -32883,11 +32863,13 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -32899,12 +32881,12 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -32936,16 +32918,13 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -32954,7 +32933,6 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -32983,25 +32961,24 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v42, v51 ; SI-NEXT: v_mov_b32_e32 v51, v39 ; SI-NEXT: v_mov_b32_e32 v39, v34 ; SI-NEXT: v_mov_b32_e32 v34, v30 -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v52 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v43, v52 ; SI-NEXT: v_mov_b32_e32 v52, v48 ; SI-NEXT: v_mov_b32_e32 v48, v35 ; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v57 -; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v53, v49 ; SI-NEXT: v_mov_b32_e32 v49, v36 ; SI-NEXT: v_mov_b32_e32 v36, v26 -; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v41, v50 ; SI-NEXT: v_mov_b32_e32 v50, v37 ; SI-NEXT: v_mov_b32_e32 v37, v24 ; SI-NEXT: v_mov_b32_e32 v33, v32 @@ -33009,26 +32986,27 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v56, v40 ; SI-NEXT: v_mov_b32_e32 v40, v38 ; SI-NEXT: v_mov_b32_e32 v38, v31 -; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v46, v54 +; SI-NEXT: v_mov_b32_e32 v54, v25 ; SI-NEXT: v_mov_b32_e32 v44, v27 -; SI-NEXT: v_mov_b32_e32 v53, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: v_mov_b32_e32 v47, v55 +; SI-NEXT: v_mov_b32_e32 v55, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v36 -; SI-NEXT: v_mov_b32_e32 v36, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v55 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: v_mov_b32_e32 v35, v48 +; SI-NEXT: v_mov_b32_e32 v48, v52 +; SI-NEXT: v_mov_b32_e32 v52, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v30, v34 ; SI-NEXT: v_mov_b32_e32 v34, v39 ; SI-NEXT: v_mov_b32_e32 v39, v51 -; SI-NEXT: v_mov_b32_e32 v51, v46 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v29, v55 +; SI-NEXT: v_mov_b32_e32 v55, v47 ; SI-NEXT: v_mov_b32_e32 v27, v44 -; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v25, v54 +; SI-NEXT: v_mov_b32_e32 v54, v46 ; SI-NEXT: v_mov_b32_e32 v31, v38 ; SI-NEXT: v_mov_b32_e32 v38, v40 ; SI-NEXT: v_mov_b32_e32 v40, v56 @@ -33036,11 +33014,12 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v32, v33 ; SI-NEXT: v_mov_b32_e32 v24, v37 ; SI-NEXT: v_mov_b32_e32 v37, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: v_mov_b32_e32 v28, v35 -; SI-NEXT: v_mov_b32_e32 v35, v48 -; SI-NEXT: v_mov_b32_e32 v48, v52 -; SI-NEXT: v_mov_b32_e32 v52, v47 +; SI-NEXT: v_mov_b32_e32 v50, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v49, v53 +; SI-NEXT: v_mov_b32_e32 v53, v45 ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v48i16_to_v12f64_scalar: @@ -33523,83 +33502,83 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB51_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -34778,13 +34757,13 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_mov_b32_e32 v22, s19 ; SI-NEXT: v_mov_b32_e32 v19, s20 ; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_mov_b32_e32 v15, s22 -; SI-NEXT: v_mov_b32_e32 v16, s23 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 ; SI-NEXT: v_mov_b32_e32 v17, s24 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v13, s26 -; SI-NEXT: v_mov_b32_e32 v14, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v16, s27 ; SI-NEXT: v_mov_b32_e32 v11, s28 ; SI-NEXT: v_mov_b32_e32 v12, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -34808,7 +34787,7 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 @@ -34852,20 +34831,20 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v52, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -34881,17 +34860,17 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v13 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 @@ -34900,28 +34879,28 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 @@ -34945,17 +34924,17 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v23 @@ -34972,17 +34951,17 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_mov_b32_e32 v35, v10 ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill @@ -34995,7 +34974,7 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v32 @@ -35016,7 +34995,7 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -35043,28 +35022,28 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -35072,14 +35051,14 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -35207,12 +35186,12 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; kill: killed $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr60 @@ -35221,16 +35200,16 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -35263,17 +35242,17 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; VI-NEXT: v_mov_b32_e32 v23, s16 ; VI-NEXT: v_mov_b32_e32 v24, s17 -; VI-NEXT: v_mov_b32_e32 v19, s18 -; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v13, s20 -; VI-NEXT: v_mov_b32_e32 v14, s21 -; VI-NEXT: v_mov_b32_e32 v11, s22 -; VI-NEXT: v_mov_b32_e32 v12, s23 -; VI-NEXT: v_mov_b32_e32 v21, s24 -; VI-NEXT: v_mov_b32_e32 v22, s25 -; VI-NEXT: v_mov_b32_e32 v17, s26 -; VI-NEXT: v_mov_b32_e32 v18, s27 +; VI-NEXT: v_mov_b32_e32 v21, s18 +; VI-NEXT: v_mov_b32_e32 v22, s19 +; VI-NEXT: v_mov_b32_e32 v17, s20 +; VI-NEXT: v_mov_b32_e32 v18, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v14, s23 +; VI-NEXT: v_mov_b32_e32 v11, s24 +; VI-NEXT: v_mov_b32_e32 v12, s25 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v19, s26 +; VI-NEXT: v_mov_b32_e32 v20, s27 ; VI-NEXT: v_mov_b32_e32 v15, s28 ; VI-NEXT: v_mov_b32_e32 v16, s29 ; VI-NEXT: s_cbranch_scc0 .LBB53_4 @@ -35290,16 +35269,16 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; VI-NEXT: s_cbranch_execnz .LBB53_3 @@ -35310,11 +35289,11 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 @@ -35328,16 +35307,16 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; VI-NEXT: .LBB53_3: ; %end @@ -35346,42 +35325,42 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v31 ; VI-NEXT: v_or_b32_sdwa v31, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 -; VI-NEXT: v_or_b32_sdwa v24, v13, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 -; VI-NEXT: v_or_b32_sdwa v25, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; VI-NEXT: v_or_b32_sdwa v32, v21, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 +; VI-NEXT: v_or_b32_sdwa v24, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; VI-NEXT: v_or_b32_sdwa v25, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v11, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 -; VI-NEXT: v_or_b32_sdwa v27, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v53 ; VI-NEXT: v_or_b32_sdwa v14, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 ; VI-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v52 ; VI-NEXT: v_or_b32_sdwa v15, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 -; VI-NEXT: v_or_b32_sdwa v28, v21, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 -; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_sdwa v13, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 -; VI-NEXT: v_or_b32_sdwa v29, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v17, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 ; VI-NEXT: v_or_b32_sdwa v17, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; VI-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_sdwa v18, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v38 +; VI-NEXT: v_or_b32_sdwa v10, v19, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 ; VI-NEXT: v_or_b32_sdwa v19, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; VI-NEXT: v_or_b32_sdwa v11, v20, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 ; VI-NEXT: v_or_b32_sdwa v21, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -35433,17 +35412,17 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GFX9-NEXT: v_mov_b32_e32 v23, s16 ; GFX9-NEXT: v_mov_b32_e32 v24, s17 -; GFX9-NEXT: v_mov_b32_e32 v19, s18 -; GFX9-NEXT: v_mov_b32_e32 v20, s19 -; GFX9-NEXT: v_mov_b32_e32 v13, s20 -; GFX9-NEXT: v_mov_b32_e32 v14, s21 -; GFX9-NEXT: v_mov_b32_e32 v11, s22 -; GFX9-NEXT: v_mov_b32_e32 v12, s23 -; GFX9-NEXT: v_mov_b32_e32 v21, s24 -; GFX9-NEXT: v_mov_b32_e32 v22, s25 -; GFX9-NEXT: v_mov_b32_e32 v17, s26 -; GFX9-NEXT: v_mov_b32_e32 v18, s27 +; GFX9-NEXT: v_mov_b32_e32 v21, s18 +; GFX9-NEXT: v_mov_b32_e32 v22, s19 +; GFX9-NEXT: v_mov_b32_e32 v17, s20 +; GFX9-NEXT: v_mov_b32_e32 v18, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v14, s23 +; GFX9-NEXT: v_mov_b32_e32 v11, s24 +; GFX9-NEXT: v_mov_b32_e32 v12, s25 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v19, s26 +; GFX9-NEXT: v_mov_b32_e32 v20, s27 ; GFX9-NEXT: v_mov_b32_e32 v15, s28 ; GFX9-NEXT: v_mov_b32_e32 v16, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB53_4 @@ -35460,16 +35439,16 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; GFX9-NEXT: s_cbranch_execnz .LBB53_3 @@ -35480,11 +35459,11 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v8 @@ -35498,58 +35477,58 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v23 ; GFX9-NEXT: .LBB53_3: ; %end -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_lshl_or_b32 v14, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v13 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v24, v55, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v17 +; GFX9-NEXT: v_lshl_or_b32 v25, v25, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v48, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v18, v39, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v38, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX9-NEXT: v_lshl_or_b32 v10, v10, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v20, v37, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v21, v36, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v23 @@ -36237,152 +36216,150 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v49 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v34 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v44 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v41 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -37213,8 +37190,8 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -37287,41 +37264,36 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; SI-NEXT: .LBB55_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v33 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v59 @@ -37348,88 +37320,91 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v52 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v38 @@ -38049,83 +38024,83 @@ define inreg <12 x double> @bitcast_v48f16_to_v12f64_scalar(<48 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v185, v5 :: v_dual_mov_b32 v186, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v187, v3 :: v_dual_mov_b32 v188, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v1 :: v_dual_mov_b32 v190, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 ; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB55_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -39569,165 +39544,165 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v44, v41 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 ; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v44, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v59, s18 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s21 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s21 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v20 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 ; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v44, v54 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, s27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v41, vcc, 3, v41 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_add_i32_e32 v54, vcc, 3, v54 ; SI-NEXT: v_add_i32_e32 v55, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 @@ -39746,7 +39721,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 @@ -39755,35 +39730,35 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v62, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v30 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v56, v40 ; SI-NEXT: s_waitcnt expcnt(0) @@ -39792,10 +39767,10 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: .LBB57_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -39804,43 +39779,45 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -39850,8 +39827,8 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -39861,8 +39838,8 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -39872,8 +39849,8 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -39883,7 +39860,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -39894,7 +39871,7 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -39904,75 +39881,73 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -40022,32 +39997,32 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; kill: killed $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; kill: killed $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -41792,11 +41767,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill @@ -41813,420 +41788,419 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v61, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v46, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v39 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_or_b32_e32 v45, v5, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v59, v3, v19 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v44, v7, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v62, v1, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: v_or_b32_e32 v57, v3, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v26 +; SI-NEXT: v_or_b32_e32 v49, v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v35, v5, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_or_b32_e32 v39, v7, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_or_b32_e32 v26, v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v33, v5, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_or_b32_e32 v25, v5, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v32, v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v37, v7, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_or_b32_e32 v36, v3, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v29, v3, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v28, v26, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v31 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v23, v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v3, v20, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v29, v21, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v20 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v43 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v19 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v20 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_or_b32_e32 v27, v1, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v43 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 -; SI-NEXT: v_or_b32_e32 v44, v26, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v43 +; SI-NEXT: v_or_b32_e32 v4, v4, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v56 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v46 +; SI-NEXT: v_or_b32_e32 v8, v8, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v43 -; SI-NEXT: v_or_b32_e32 v2, v2, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v40 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v25 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v41 -; SI-NEXT: v_or_b32_e32 v4, v4, v25 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v26 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v40 -; SI-NEXT: v_or_b32_e32 v6, v6, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v45 -; SI-NEXT: v_or_b32_e32 v8, v8, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v47 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v26 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v56 -; SI-NEXT: v_or_b32_e32 v10, v10, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47 -; SI-NEXT: v_or_b32_e32 v12, v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v63 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v60 -; SI-NEXT: v_or_b32_e32 v14, v14, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v25 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v25 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 -; SI-NEXT: v_or_b32_e32 v18, v18, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v61 -; SI-NEXT: v_or_b32_e32 v22, v22, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v56 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v62 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v60 +; SI-NEXT: v_or_b32_e32 v31, v20, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v61 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v62 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v33, v19, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v57 +; SI-NEXT: v_or_b32_e32 v35, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v42 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v59 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v16, v16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v58 -; SI-NEXT: v_lshr_b64 v[50:51], v[15:16], 16 -; SI-NEXT: v_or_b32_e32 v20, v20, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v42 -; SI-NEXT: v_mov_b32_e32 v51, v29 -; SI-NEXT: v_lshr_b64 v[29:30], v[21:22], 16 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 16 -; SI-NEXT: v_mov_b32_e32 v31, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[54:55], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[13:14], 16 -; SI-NEXT: v_mov_b32_e32 v55, v35 -; SI-NEXT: v_mov_b32_e32 v53, v32 -; SI-NEXT: v_mov_b32_e32 v49, v28 -; SI-NEXT: v_mov_b32_e32 v39, v27 -; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 -; SI-NEXT: v_mov_b32_e32 v11, v33 -; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[3:4], 16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 +; SI-NEXT: v_or_b32_e32 v16, v16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v59 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_lshr_b64 v[19:20], v[1:2], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[40:41], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v41, v26 +; SI-NEXT: v_mov_b32_e32 v55, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[32:33], 16 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_lshr_b64 v[23:24], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v13, v49 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v53, v36 +; SI-NEXT: v_lshr_b64 v[48:49], v[11:12], 16 +; SI-NEXT: v_mov_b32_e32 v11, v37 +; SI-NEXT: v_lshr_b64 v[38:39], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[7:8], 16 +; SI-NEXT: v_mov_b32_e32 v24, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[3:4], 16 +; SI-NEXT: v_mov_b32_e32 v22, v29 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v62 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v60 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v43 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 93690270fd797..6df3ab2cb50fd 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -2364,6 +2364,7 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -2375,7 +2376,6 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2417,6 +2417,7 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -2428,7 +2429,6 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2470,6 +2470,7 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -2481,7 +2482,6 @@ define inreg <26 x i32> @bitcast_v13f64_to_v26i32_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 @@ -3526,6 +3526,12 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 ; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: s_lshr_b32 s92, s5, 16 ; SI-NEXT: s_lshr_b32 s93, s7, 16 ; SI-NEXT: s_lshr_b32 s94, s9, 16 @@ -3539,50 +3545,47 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s38, s25, 16 ; SI-NEXT: s_lshr_b32 s39, s41, 16 ; SI-NEXT: s_lshr_b32 s48, s43, 16 -; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 -; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s88, 16 -; SI-NEXT: s_and_b32 s29, s42, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s42, 0xffff +; SI-NEXT: s_lshl_b32 s29, s88, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v1, s27 ; SI-NEXT: s_and_b32 s27, s43, 0xffff ; SI-NEXT: s_lshl_b32 s29, s48, 16 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s78, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: s_and_b32 s27, s40, 0xffff +; SI-NEXT: s_lshl_b32 s29, s78, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 ; SI-NEXT: s_and_b32 s27, s41, 0xffff ; SI-NEXT: s_lshl_b32 s29, s39, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 ; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v2, s24 ; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: s_lshl_b32 s25, s38, 16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s24 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: s_lshl_b32 s24, s74, 16 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -3759,10 +3762,10 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr93 -; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v26i32_to_v52i16_scalar: @@ -5616,52 +5619,49 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v8 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v6 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v42, v6 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v62, v30 -; SI-NEXT: v_mov_b32_e32 v30, v24 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; SI-NEXT: v_mov_b32_e32 v31, v24 ; SI-NEXT: v_mov_b32_e32 v38, v22 ; SI-NEXT: v_mov_b32_e32 v39, v20 ; SI-NEXT: v_mov_b32_e32 v48, v18 ; SI-NEXT: v_mov_b32_e32 v49, v16 ; SI-NEXT: v_mov_b32_e32 v50, v14 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: v_mov_b32_e32 v41, v10 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 @@ -5669,65 +5669,65 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v9, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v7, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v8, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v9, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: v_or_b32_e32 v10, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v11, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v12, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v12, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v13, v0, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: v_or_b32_e32 v14, v0, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v15, v0, v55 +; SI-NEXT: v_or_b32_e32 v15, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v16, v0, v54 +; SI-NEXT: v_or_b32_e32 v16, v0, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v17, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v17, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v22, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v23, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v24, v0, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 ; SI-NEXT: v_or_b32_e32 v25, v0, v27 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -5738,74 +5738,76 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 @@ -5815,13 +5817,13 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 @@ -5832,16 +5834,13 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -5850,7 +5849,6 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -5879,87 +5877,90 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v47, v43 -; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v60, v46 +; SI-NEXT: v_mov_b32_e32 v46, v55 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_mov_b32_e32 v42, v50 ; SI-NEXT: v_mov_b32_e32 v50, v38 -; SI-NEXT: v_mov_b32_e32 v38, v62 -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v62, v56 -; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v36, v58 +; SI-NEXT: v_mov_b32_e32 v58, v44 ; SI-NEXT: v_mov_b32_e32 v44, v40 ; SI-NEXT: v_mov_b32_e32 v40, v39 ; SI-NEXT: v_mov_b32_e32 v39, v28 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v35, v59 ; SI-NEXT: v_mov_b32_e32 v59, v45 ; SI-NEXT: v_mov_b32_e32 v45, v41 ; SI-NEXT: v_mov_b32_e32 v41, v48 ; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v52 -; SI-NEXT: v_mov_b32_e32 v52, v46 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v61, v47 +; SI-NEXT: v_mov_b32_e32 v47, v43 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v49 +; SI-NEXT: v_mov_b32_e32 v49, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: v_mov_b32_e32 v56, v27 +; SI-NEXT: v_mov_b32_e32 v32, v63 ; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v27 -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: v_mov_b32_e32 v57, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v27, v57 +; SI-NEXT: v_mov_b32_e32 v29, v57 ; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: v_mov_b32_e32 v46, v52 -; SI-NEXT: v_mov_b32_e32 v52, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v27, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v31, v49 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v43, v47 +; SI-NEXT: v_mov_b32_e32 v47, v61 +; SI-NEXT: v_mov_b32_e32 v61, v34 ; SI-NEXT: v_mov_b32_e32 v26, v48 ; SI-NEXT: v_mov_b32_e32 v48, v41 ; SI-NEXT: v_mov_b32_e32 v41, v45 ; SI-NEXT: v_mov_b32_e32 v45, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v35 ; SI-NEXT: v_mov_b32_e32 v28, v39 ; SI-NEXT: v_mov_b32_e32 v39, v40 ; SI-NEXT: v_mov_b32_e32 v40, v44 -; SI-NEXT: v_mov_b32_e32 v44, v56 -; SI-NEXT: v_mov_b32_e32 v56, v62 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v44, v58 +; SI-NEXT: v_mov_b32_e32 v58, v36 +; SI-NEXT: v_mov_b32_e32 v32, v37 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v38 ; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v47 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v42, v55 +; SI-NEXT: v_mov_b32_e32 v55, v46 +; SI-NEXT: v_mov_b32_e32 v46, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v52i16_to_v26i32_scalar: @@ -6015,51 +6016,51 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -6082,11 +6083,11 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -6481,61 +6482,77 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0] @@ -6544,22 +6561,6 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB15_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -9451,181 +9452,178 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -10463,24 +10461,24 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -10498,12 +10496,12 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 @@ -10523,9 +10521,9 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) @@ -10534,7 +10532,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -10559,26 +10557,26 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 ; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 @@ -10586,13 +10584,13 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v51, v44 ; SI-NEXT: v_or_b32_e32 v7, v45, v7 -; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v8, v41, v8 ; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v47, v11 -; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_or_b32_e32 v11, v53, v11 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 ; SI-NEXT: v_or_b32_e32 v13, v52, v13 ; SI-NEXT: v_or_b32_e32 v14, v63, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 @@ -10620,26 +10618,31 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -10665,136 +10668,128 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v40 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v56 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v57 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload @@ -10912,7 +10907,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v49, v62 ; SI-NEXT: v_mov_b32_e32 v62, v27 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v51, v44 ; SI-NEXT: v_mov_b32_e32 v27, v62 ; SI-NEXT: v_mov_b32_e32 v62, v49 ; SI-NEXT: v_mov_b32_e32 v26, v61 @@ -10983,51 +10978,51 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -11048,13 +11043,13 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; VI-NEXT: .LBB19_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -11412,61 +11407,77 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1] @@ -11475,22 +11486,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB19_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -13190,6 +13185,7 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -13201,7 +13197,6 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 @@ -13243,6 +13238,7 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -13254,7 +13250,6 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 @@ -13296,6 +13291,7 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -13307,7 +13303,6 @@ define inreg <26 x float> @bitcast_v13f64_to_v26f32_scalar(<13 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 @@ -14216,22 +14211,21 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v19, s20 ; SI-NEXT: v_mov_b32_e32 v20, s21 ; SI-NEXT: v_mov_b32_e32 v21, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v22, s23 ; SI-NEXT: v_mov_b32_e32 v17, s24 ; SI-NEXT: v_mov_b32_e32 v18, s25 ; SI-NEXT: v_mov_b32_e32 v15, s26 ; SI-NEXT: v_mov_b32_e32 v16, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s28 ; SI-NEXT: v_mov_b32_e32 v14, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 @@ -14240,33 +14234,33 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 ; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_lshr_b64 v[33:34], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: v_lshr_b64 v[38:39], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[25:26], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -14283,131 +14277,131 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[33:34], v[13:14], 16 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_lshr_b64 v[34:35], v[15:16], 16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshr_b64 v[35:36], v[17:18], 16 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_lshr_b64 v[37:38], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v48 +; SI-NEXT: v_or_b32_e32 v25, v25, v39 ; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v46 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v48 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v38 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v45 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v37 ; SI-NEXT: v_or_b32_e32 v19, v19, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v42 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v41 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 ; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen @@ -14419,7 +14413,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14431,7 +14425,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14443,7 +14437,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14455,7 +14449,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14467,7 +14461,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14479,47 +14473,46 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v26f32_to_v52i16_scalar: @@ -16451,52 +16444,49 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v8 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v6 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v42, v6 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v62, v30 -; SI-NEXT: v_mov_b32_e32 v30, v24 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; SI-NEXT: v_mov_b32_e32 v31, v24 ; SI-NEXT: v_mov_b32_e32 v38, v22 ; SI-NEXT: v_mov_b32_e32 v39, v20 ; SI-NEXT: v_mov_b32_e32 v48, v18 ; SI-NEXT: v_mov_b32_e32 v49, v16 ; SI-NEXT: v_mov_b32_e32 v50, v14 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: v_mov_b32_e32 v41, v10 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 @@ -16504,65 +16494,65 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v9, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v7, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v8, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v9, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: v_or_b32_e32 v10, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v11, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v12, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v12, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v13, v0, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: v_or_b32_e32 v14, v0, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v15, v0, v55 +; SI-NEXT: v_or_b32_e32 v15, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v16, v0, v54 +; SI-NEXT: v_or_b32_e32 v16, v0, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v17, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v17, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v22, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v23, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v24, v0, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 ; SI-NEXT: v_or_b32_e32 v25, v0, v27 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -16573,74 +16563,76 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 @@ -16650,13 +16642,13 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 @@ -16667,16 +16659,13 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -16685,7 +16674,6 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -16714,87 +16702,90 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v47, v43 -; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v60, v46 +; SI-NEXT: v_mov_b32_e32 v46, v55 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_mov_b32_e32 v42, v50 ; SI-NEXT: v_mov_b32_e32 v50, v38 -; SI-NEXT: v_mov_b32_e32 v38, v62 -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v62, v56 -; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v36, v58 +; SI-NEXT: v_mov_b32_e32 v58, v44 ; SI-NEXT: v_mov_b32_e32 v44, v40 ; SI-NEXT: v_mov_b32_e32 v40, v39 ; SI-NEXT: v_mov_b32_e32 v39, v28 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v35, v59 ; SI-NEXT: v_mov_b32_e32 v59, v45 ; SI-NEXT: v_mov_b32_e32 v45, v41 ; SI-NEXT: v_mov_b32_e32 v41, v48 ; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v52 -; SI-NEXT: v_mov_b32_e32 v52, v46 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v61, v47 +; SI-NEXT: v_mov_b32_e32 v47, v43 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v49 +; SI-NEXT: v_mov_b32_e32 v49, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: v_mov_b32_e32 v56, v27 +; SI-NEXT: v_mov_b32_e32 v32, v63 ; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v27 -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: v_mov_b32_e32 v57, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v27, v57 +; SI-NEXT: v_mov_b32_e32 v29, v57 ; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: v_mov_b32_e32 v46, v52 -; SI-NEXT: v_mov_b32_e32 v52, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v27, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v31, v49 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v43, v47 +; SI-NEXT: v_mov_b32_e32 v47, v61 +; SI-NEXT: v_mov_b32_e32 v61, v34 ; SI-NEXT: v_mov_b32_e32 v26, v48 ; SI-NEXT: v_mov_b32_e32 v48, v41 ; SI-NEXT: v_mov_b32_e32 v41, v45 ; SI-NEXT: v_mov_b32_e32 v45, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v35 ; SI-NEXT: v_mov_b32_e32 v28, v39 ; SI-NEXT: v_mov_b32_e32 v39, v40 ; SI-NEXT: v_mov_b32_e32 v40, v44 -; SI-NEXT: v_mov_b32_e32 v44, v56 -; SI-NEXT: v_mov_b32_e32 v56, v62 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v44, v58 +; SI-NEXT: v_mov_b32_e32 v58, v36 +; SI-NEXT: v_mov_b32_e32 v32, v37 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v38 ; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v47 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v42, v55 +; SI-NEXT: v_mov_b32_e32 v55, v46 +; SI-NEXT: v_mov_b32_e32 v46, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v52i16_to_v26f32_scalar: @@ -16850,51 +16841,51 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -16917,11 +16908,11 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -17316,61 +17307,77 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 ; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0] @@ -17379,22 +17386,6 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB31_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -18764,7 +18755,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v21, s16 +; SI-NEXT: v_mov_b32_e32 v20, s16 ; SI-NEXT: v_mov_b32_e32 v18, s17 ; SI-NEXT: v_mov_b32_e32 v16, s18 ; SI-NEXT: v_mov_b32_e32 v15, s19 @@ -18776,124 +18767,124 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v26, s23 ; SI-NEXT: v_mov_b32_e32 v24, s24 ; SI-NEXT: v_mov_b32_e32 v23, s25 -; SI-NEXT: v_mov_b32_e32 v22, s26 +; SI-NEXT: v_mov_b32_e32 v21, s26 ; SI-NEXT: v_mov_b32_e32 v19, s27 ; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v11 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v25 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v10 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v7 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v7 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v5 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v4 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v4 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v3 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v2 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v13 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v17 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v18 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 @@ -18903,11 +18894,11 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v7 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -18915,15 +18906,15 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v63 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v63 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 @@ -18934,27 +18925,27 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 @@ -18969,34 +18960,34 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 @@ -19007,7 +18998,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_mov_b32_e32 v35, v11 -; SI-NEXT: v_mov_b32_e32 v29, v12 +; SI-NEXT: v_mov_b32_e32 v33, v12 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -19023,7 +19014,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -19082,7 +19073,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -19113,7 +19104,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -19122,7 +19113,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -19131,7 +19122,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -19140,7 +19131,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -19209,7 +19200,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -19225,7 +19216,7 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -19249,17 +19240,17 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr47 @@ -19276,46 +19267,46 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; kill: killed $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_branch .LBB33_2 ; ; VI-LABEL: bitcast_v26f32_to_v52f16_scalar: @@ -20426,181 +20417,178 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -21438,24 +21426,24 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -21473,12 +21461,12 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 @@ -21498,9 +21486,9 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) @@ -21509,7 +21497,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -21534,26 +21522,26 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 ; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 @@ -21561,13 +21549,13 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v51, v44 ; SI-NEXT: v_or_b32_e32 v7, v45, v7 -; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v8, v41, v8 ; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v47, v11 -; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_or_b32_e32 v11, v53, v11 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 ; SI-NEXT: v_or_b32_e32 v13, v52, v13 ; SI-NEXT: v_or_b32_e32 v14, v63, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 @@ -21595,26 +21583,31 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -21640,136 +21633,128 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v40 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v56 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v57 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload @@ -21887,7 +21872,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v49, v62 ; SI-NEXT: v_mov_b32_e32 v62, v27 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v51, v44 ; SI-NEXT: v_mov_b32_e32 v27, v62 ; SI-NEXT: v_mov_b32_e32 v62, v49 ; SI-NEXT: v_mov_b32_e32 v26, v61 @@ -21958,51 +21943,51 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -22023,13 +22008,13 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -22387,61 +22372,77 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 ; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1] @@ -22450,22 +22451,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB35_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -23325,6 +23310,7 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -23336,7 +23322,6 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 @@ -23378,6 +23363,7 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -23389,7 +23375,6 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB39_4 @@ -23431,6 +23416,7 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -23442,7 +23428,6 @@ define inreg <13 x i64> @bitcast_v13f64_to_v13i64_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 @@ -24521,43 +24506,46 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[78:79], s[40:41], 16 ; SI-NEXT: s_lshr_b64 s[88:89], s[42:43], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s88, 16 -; SI-NEXT: s_and_b32 s29, s42, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s42, 0xffff +; SI-NEXT: s_lshl_b32 s29, s88, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v1, s27 ; SI-NEXT: s_and_b32 s27, s43, 0xffff ; SI-NEXT: s_lshl_b32 s29, s48, 16 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s78, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: s_and_b32 s27, s40, 0xffff +; SI-NEXT: s_lshl_b32 s29, s78, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 ; SI-NEXT: s_and_b32 s27, s41, 0xffff ; SI-NEXT: s_lshl_b32 s29, s39, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 ; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_lshl_b32 s27, s76, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 ; SI-NEXT: s_or_b32 s24, s24, s27 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v5, s24 +; SI-NEXT: v_mov_b32_e32 v2, s24 ; SI-NEXT: s_and_b32 s24, s25, 0xffff -; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 ; SI-NEXT: s_lshl_b32 s25, s38, 16 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s24 ; SI-NEXT: s_and_b32 s22, s22, 0xffff ; SI-NEXT: s_lshl_b32 s24, s74, 16 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -24734,10 +24722,10 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr93 -; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v13i64_to_v52i16_scalar: @@ -26591,52 +26579,49 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v8 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v6 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v42, v6 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v62, v30 -; SI-NEXT: v_mov_b32_e32 v30, v24 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; SI-NEXT: v_mov_b32_e32 v31, v24 ; SI-NEXT: v_mov_b32_e32 v38, v22 ; SI-NEXT: v_mov_b32_e32 v39, v20 ; SI-NEXT: v_mov_b32_e32 v48, v18 ; SI-NEXT: v_mov_b32_e32 v49, v16 ; SI-NEXT: v_mov_b32_e32 v50, v14 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: v_mov_b32_e32 v41, v10 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 @@ -26644,65 +26629,65 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v9, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v7, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v8, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v9, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: v_or_b32_e32 v10, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v11, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v12, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v12, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v13, v0, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: v_or_b32_e32 v14, v0, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v15, v0, v55 +; SI-NEXT: v_or_b32_e32 v15, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v16, v0, v54 +; SI-NEXT: v_or_b32_e32 v16, v0, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v17, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v17, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v22, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v23, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v24, v0, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 ; SI-NEXT: v_or_b32_e32 v25, v0, v27 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -26713,74 +26698,76 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 @@ -26790,13 +26777,13 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 @@ -26807,16 +26794,13 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -26825,7 +26809,6 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -26854,87 +26837,90 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v47, v43 -; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v60, v46 +; SI-NEXT: v_mov_b32_e32 v46, v55 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_mov_b32_e32 v42, v50 ; SI-NEXT: v_mov_b32_e32 v50, v38 -; SI-NEXT: v_mov_b32_e32 v38, v62 -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v62, v56 -; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v36, v58 +; SI-NEXT: v_mov_b32_e32 v58, v44 ; SI-NEXT: v_mov_b32_e32 v44, v40 ; SI-NEXT: v_mov_b32_e32 v40, v39 ; SI-NEXT: v_mov_b32_e32 v39, v28 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v35, v59 ; SI-NEXT: v_mov_b32_e32 v59, v45 ; SI-NEXT: v_mov_b32_e32 v45, v41 ; SI-NEXT: v_mov_b32_e32 v41, v48 ; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v52 -; SI-NEXT: v_mov_b32_e32 v52, v46 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v61, v47 +; SI-NEXT: v_mov_b32_e32 v47, v43 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v49 +; SI-NEXT: v_mov_b32_e32 v49, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: v_mov_b32_e32 v56, v27 +; SI-NEXT: v_mov_b32_e32 v32, v63 ; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v27 -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: v_mov_b32_e32 v57, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v27, v57 +; SI-NEXT: v_mov_b32_e32 v29, v57 ; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: v_mov_b32_e32 v46, v52 -; SI-NEXT: v_mov_b32_e32 v52, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v27, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v31, v49 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v43, v47 +; SI-NEXT: v_mov_b32_e32 v47, v61 +; SI-NEXT: v_mov_b32_e32 v61, v34 ; SI-NEXT: v_mov_b32_e32 v26, v48 ; SI-NEXT: v_mov_b32_e32 v48, v41 ; SI-NEXT: v_mov_b32_e32 v41, v45 ; SI-NEXT: v_mov_b32_e32 v45, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v35 ; SI-NEXT: v_mov_b32_e32 v28, v39 ; SI-NEXT: v_mov_b32_e32 v39, v40 ; SI-NEXT: v_mov_b32_e32 v40, v44 -; SI-NEXT: v_mov_b32_e32 v44, v56 -; SI-NEXT: v_mov_b32_e32 v56, v62 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v44, v58 +; SI-NEXT: v_mov_b32_e32 v58, v36 +; SI-NEXT: v_mov_b32_e32 v32, v37 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v38 ; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v47 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v42, v55 +; SI-NEXT: v_mov_b32_e32 v55, v46 +; SI-NEXT: v_mov_b32_e32 v46, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v52i16_to_v13i64_scalar: @@ -26990,51 +26976,51 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -27057,11 +27043,11 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -27456,61 +27442,77 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0] @@ -27519,22 +27521,6 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB43_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -30441,181 +30427,178 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -31453,24 +31436,24 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -31488,12 +31471,12 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 @@ -31513,9 +31496,9 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) @@ -31524,7 +31507,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -31549,26 +31532,26 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 ; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 @@ -31576,13 +31559,13 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v51, v44 ; SI-NEXT: v_or_b32_e32 v7, v45, v7 -; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v8, v41, v8 ; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v47, v11 -; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_or_b32_e32 v11, v53, v11 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 ; SI-NEXT: v_or_b32_e32 v13, v52, v13 ; SI-NEXT: v_or_b32_e32 v14, v63, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 @@ -31610,26 +31593,31 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -31655,136 +31643,128 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v40 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v56 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v57 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload @@ -31902,7 +31882,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v49, v62 ; SI-NEXT: v_mov_b32_e32 v62, v27 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v51, v44 ; SI-NEXT: v_mov_b32_e32 v27, v62 ; SI-NEXT: v_mov_b32_e32 v62, v49 ; SI-NEXT: v_mov_b32_e32 v26, v61 @@ -31973,51 +31953,51 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -32038,13 +32018,13 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -32402,61 +32382,77 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1] @@ -32465,22 +32461,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB47_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -33525,22 +33505,21 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v19, s20 ; SI-NEXT: v_mov_b32_e32 v20, s21 ; SI-NEXT: v_mov_b32_e32 v21, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v22, s23 ; SI-NEXT: v_mov_b32_e32 v17, s24 ; SI-NEXT: v_mov_b32_e32 v18, s25 ; SI-NEXT: v_mov_b32_e32 v15, s26 ; SI-NEXT: v_mov_b32_e32 v16, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s28 ; SI-NEXT: v_mov_b32_e32 v14, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 @@ -33549,33 +33528,33 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 ; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_lshr_b64 v[33:34], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: v_lshr_b64 v[38:39], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[25:26], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 @@ -33587,123 +33566,123 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_lshr_b64 v[29:30], v[7:8], 16 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_lshr_b64 v[33:34], v[13:14], 16 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_lshr_b64 v[34:35], v[15:16], 16 ; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_lshr_b64 v[35:36], v[17:18], 16 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[37:38], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v18 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v48 +; SI-NEXT: v_or_b32_e32 v25, v25, v39 ; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v46 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v48 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v38 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v45 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v37 ; SI-NEXT: v_or_b32_e32 v19, v19, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v36 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v42 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v41 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 ; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen @@ -33715,7 +33694,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -33727,7 +33706,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -33739,7 +33718,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -33751,7 +33730,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -33763,7 +33742,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -33775,66 +33754,65 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v13f64_to_v52i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_mov_b32_e32 v21, s16 -; VI-NEXT: v_mov_b32_e32 v22, s17 -; VI-NEXT: v_mov_b32_e32 v15, s18 -; VI-NEXT: v_mov_b32_e32 v16, s19 -; VI-NEXT: v_mov_b32_e32 v13, s20 -; VI-NEXT: v_mov_b32_e32 v14, s21 -; VI-NEXT: v_mov_b32_e32 v30, s22 -; VI-NEXT: v_mov_b32_e32 v31, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v19, s26 -; VI-NEXT: v_mov_b32_e32 v20, s27 +; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v24, s17 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v14, s23 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v32, s24 +; VI-NEXT: v_mov_b32_e32 v33, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 ; VI-NEXT: v_mov_b32_e32 v17, s28 ; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill @@ -33857,18 +33835,18 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; VI-NEXT: s_cbranch_execnz .LBB49_3 ; VI-NEXT: .LBB49_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -33878,12 +33856,12 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 @@ -33898,43 +33876,43 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; VI-NEXT: .LBB49_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v21, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; VI-NEXT: v_or_b32_sdwa v30, v30, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 -; VI-NEXT: v_or_b32_sdwa v31, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v33, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v19, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v21, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload @@ -33988,10 +33966,10 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr12 @@ -34014,19 +33992,19 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_mov_b32_e32 v21, s16 -; GFX9-NEXT: v_mov_b32_e32 v22, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v13, s20 -; GFX9-NEXT: v_mov_b32_e32 v14, s21 -; GFX9-NEXT: v_mov_b32_e32 v30, s22 -; GFX9-NEXT: v_mov_b32_e32 v31, s23 -; GFX9-NEXT: v_mov_b32_e32 v23, s24 -; GFX9-NEXT: v_mov_b32_e32 v24, s25 -; GFX9-NEXT: v_mov_b32_e32 v19, s26 -; GFX9-NEXT: v_mov_b32_e32 v20, s27 +; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v24, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v14, s23 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v32, s24 +; GFX9-NEXT: v_mov_b32_e32 v33, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s27 ; GFX9-NEXT: v_mov_b32_e32 v17, s28 ; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill @@ -34049,18 +34027,18 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; GFX9-NEXT: s_cbranch_execnz .LBB49_3 ; GFX9-NEXT: .LBB49_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -34070,12 +34048,12 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 @@ -34090,34 +34068,30 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; GFX9-NEXT: .LBB49_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v30 -; GFX9-NEXT: v_lshl_or_b32 v30, v43, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v31 -; GFX9-NEXT: v_lshl_or_b32 v31, v42, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v32, v43, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v33, v42, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v21 ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v12, v12, 16, v13 @@ -34130,30 +34104,34 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v16, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; GFX9-NEXT: v_lshl_or_b32 v17, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v18, v52, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v21 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v20, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v21, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v22, v48, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v23, v39, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 ; GFX9-NEXT: v_lshl_or_b32 v24, v38, 16, v0 @@ -34180,10 +34158,10 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr12 @@ -35721,52 +35699,49 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v47, v8 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v6 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v34, v2 -; SI-NEXT: v_mov_b32_e32 v37, v0 +; SI-NEXT: v_mov_b32_e32 v53, v8 +; SI-NEXT: v_mov_b32_e32 v42, v6 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v43, v2 +; SI-NEXT: v_mov_b32_e32 v44, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v62, v30 -; SI-NEXT: v_mov_b32_e32 v30, v24 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; SI-NEXT: v_mov_b32_e32 v31, v24 ; SI-NEXT: v_mov_b32_e32 v38, v22 ; SI-NEXT: v_mov_b32_e32 v39, v20 ; SI-NEXT: v_mov_b32_e32 v48, v18 ; SI-NEXT: v_mov_b32_e32 v49, v16 ; SI-NEXT: v_mov_b32_e32 v50, v14 -; SI-NEXT: v_mov_b32_e32 v40, v12 -; SI-NEXT: v_mov_b32_e32 v41, v10 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v52, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 @@ -35774,65 +35749,65 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v9, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v7, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v8, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v9, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: v_or_b32_e32 v10, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v11, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v12, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v12, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v13, v0, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: v_or_b32_e32 v14, v0, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v15, v0, v55 +; SI-NEXT: v_or_b32_e32 v15, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v16, v0, v54 +; SI-NEXT: v_or_b32_e32 v16, v0, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v17, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v17, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v18, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v51 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v22, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v23, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_or_b32_e32 v24, v0, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 ; SI-NEXT: v_or_b32_e32 v25, v0, v27 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -35843,74 +35818,76 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 @@ -35920,13 +35897,13 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 @@ -35937,16 +35914,13 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: v_or_b32_e32 v0, v29, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -35955,7 +35929,6 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -35984,87 +35957,90 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v47, v43 -; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v60, v46 +; SI-NEXT: v_mov_b32_e32 v46, v55 +; SI-NEXT: v_mov_b32_e32 v55, v42 +; SI-NEXT: v_mov_b32_e32 v42, v50 ; SI-NEXT: v_mov_b32_e32 v50, v38 -; SI-NEXT: v_mov_b32_e32 v38, v62 -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v62, v56 -; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v36, v58 +; SI-NEXT: v_mov_b32_e32 v58, v44 ; SI-NEXT: v_mov_b32_e32 v44, v40 ; SI-NEXT: v_mov_b32_e32 v40, v39 ; SI-NEXT: v_mov_b32_e32 v39, v28 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v35, v59 ; SI-NEXT: v_mov_b32_e32 v59, v45 ; SI-NEXT: v_mov_b32_e32 v45, v41 ; SI-NEXT: v_mov_b32_e32 v41, v48 ; SI-NEXT: v_mov_b32_e32 v48, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v52 -; SI-NEXT: v_mov_b32_e32 v52, v46 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v61, v47 +; SI-NEXT: v_mov_b32_e32 v47, v43 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mov_b32_e32 v46, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v30 -; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v43, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: v_mov_b32_e32 v51, v49 +; SI-NEXT: v_mov_b32_e32 v49, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v33, v62 +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: v_mov_b32_e32 v56, v27 +; SI-NEXT: v_mov_b32_e32 v32, v63 ; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v27 -; SI-NEXT: v_mov_b32_e32 v53, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: v_mov_b32_e32 v57, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v53 -; SI-NEXT: v_mov_b32_e32 v27, v57 +; SI-NEXT: v_mov_b32_e32 v29, v57 ; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v30, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: v_mov_b32_e32 v46, v52 -; SI-NEXT: v_mov_b32_e32 v52, v60 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v27, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v31, v49 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_mov_b32_e32 v51, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v43 +; SI-NEXT: v_mov_b32_e32 v43, v47 +; SI-NEXT: v_mov_b32_e32 v47, v61 +; SI-NEXT: v_mov_b32_e32 v61, v34 ; SI-NEXT: v_mov_b32_e32 v26, v48 ; SI-NEXT: v_mov_b32_e32 v48, v41 ; SI-NEXT: v_mov_b32_e32 v41, v45 ; SI-NEXT: v_mov_b32_e32 v45, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v35 ; SI-NEXT: v_mov_b32_e32 v28, v39 ; SI-NEXT: v_mov_b32_e32 v39, v40 ; SI-NEXT: v_mov_b32_e32 v40, v44 -; SI-NEXT: v_mov_b32_e32 v44, v56 -; SI-NEXT: v_mov_b32_e32 v56, v62 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v44, v58 +; SI-NEXT: v_mov_b32_e32 v58, v36 +; SI-NEXT: v_mov_b32_e32 v32, v37 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v38 ; SI-NEXT: v_mov_b32_e32 v38, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v47 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v42, v55 +; SI-NEXT: v_mov_b32_e32 v55, v46 +; SI-NEXT: v_mov_b32_e32 v46, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v52i16_to_v13f64_scalar: @@ -36120,51 +36096,51 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -36187,11 +36163,11 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -36586,61 +36562,77 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v191, v191, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v190, v190, 3 op_sel_hi:[1,0] @@ -36649,22 +36641,6 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB51_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -37973,12 +37949,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_mov_b32_e32 v17, s20 ; SI-NEXT: v_mov_b32_e32 v18, s21 ; SI-NEXT: v_mov_b32_e32 v23, s22 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v24, s23 ; SI-NEXT: v_mov_b32_e32 v19, s24 ; SI-NEXT: v_mov_b32_e32 v20, s25 ; SI-NEXT: v_mov_b32_e32 v15, s26 ; SI-NEXT: v_mov_b32_e32 v16, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s28 ; SI-NEXT: v_mov_b32_e32 v14, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -38007,12 +37983,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v10 ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -38033,13 +38009,13 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 @@ -38047,7 +38023,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v24 @@ -38112,7 +38088,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 @@ -38151,17 +38127,17 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 @@ -38190,20 +38166,20 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 @@ -38215,7 +38191,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_mov_b32_e32 v43, v10 ; SI-NEXT: v_mov_b32_e32 v41, v11 -; SI-NEXT: v_mov_b32_e32 v51, v12 +; SI-NEXT: v_mov_b32_e32 v55, v12 ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -38289,7 +38265,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -38322,7 +38298,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -38331,7 +38307,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -38340,7 +38316,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -38349,7 +38325,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -38409,7 +38385,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -38432,7 +38408,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -38456,13 +38432,13 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr55 ; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr55 ; SI-NEXT: ; kill: killed $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -38483,65 +38459,65 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr55 ; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr55 ; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr55 ; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr55 ; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 +; SI-NEXT: ; kill: killed $vgpr55 ; SI-NEXT: ; kill: killed $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v13f64_to_v52f16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; VI-NEXT: v_mov_b32_e32 v21, s16 -; VI-NEXT: v_mov_b32_e32 v22, s17 -; VI-NEXT: v_mov_b32_e32 v15, s18 -; VI-NEXT: v_mov_b32_e32 v16, s19 -; VI-NEXT: v_mov_b32_e32 v13, s20 -; VI-NEXT: v_mov_b32_e32 v14, s21 -; VI-NEXT: v_mov_b32_e32 v30, s22 -; VI-NEXT: v_mov_b32_e32 v31, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v19, s26 -; VI-NEXT: v_mov_b32_e32 v20, s27 +; VI-NEXT: v_mov_b32_e32 v23, s16 +; VI-NEXT: v_mov_b32_e32 v24, s17 +; VI-NEXT: v_mov_b32_e32 v19, s18 +; VI-NEXT: v_mov_b32_e32 v20, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 +; VI-NEXT: v_mov_b32_e32 v13, s22 +; VI-NEXT: v_mov_b32_e32 v14, s23 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v32, s24 +; VI-NEXT: v_mov_b32_e32 v33, s25 +; VI-NEXT: v_mov_b32_e32 v21, s26 +; VI-NEXT: v_mov_b32_e32 v22, s27 ; VI-NEXT: v_mov_b32_e32 v17, s28 ; VI-NEXT: v_mov_b32_e32 v18, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill @@ -38564,18 +38540,18 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; VI-NEXT: s_cbranch_execnz .LBB53_3 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -38585,12 +38561,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 @@ -38605,43 +38581,43 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; VI-NEXT: .LBB53_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v21, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; VI-NEXT: v_or_b32_sdwa v26, v15, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v27 -; VI-NEXT: v_or_b32_sdwa v27, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v37 +; VI-NEXT: v_or_b32_sdwa v37, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v26 +; VI-NEXT: v_or_b32_sdwa v26, v19, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v27 +; VI-NEXT: v_or_b32_sdwa v27, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 +; VI-NEXT: v_or_b32_sdwa v28, v15, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v43 -; VI-NEXT: v_or_b32_sdwa v30, v30, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v42 -; VI-NEXT: v_or_b32_sdwa v31, v31, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v33, v33, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v19, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v21, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v20, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v22, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v41 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v40 ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload @@ -38695,10 +38671,10 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; VI-NEXT: ; implicit-def: $vgpr27 ; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr12 @@ -38721,19 +38697,19 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_mov_b32_e32 v21, s16 -; GFX9-NEXT: v_mov_b32_e32 v22, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v13, s20 -; GFX9-NEXT: v_mov_b32_e32 v14, s21 -; GFX9-NEXT: v_mov_b32_e32 v30, s22 -; GFX9-NEXT: v_mov_b32_e32 v31, s23 -; GFX9-NEXT: v_mov_b32_e32 v23, s24 -; GFX9-NEXT: v_mov_b32_e32 v24, s25 -; GFX9-NEXT: v_mov_b32_e32 v19, s26 -; GFX9-NEXT: v_mov_b32_e32 v20, s27 +; GFX9-NEXT: v_mov_b32_e32 v23, s16 +; GFX9-NEXT: v_mov_b32_e32 v24, s17 +; GFX9-NEXT: v_mov_b32_e32 v19, s18 +; GFX9-NEXT: v_mov_b32_e32 v20, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 +; GFX9-NEXT: v_mov_b32_e32 v13, s22 +; GFX9-NEXT: v_mov_b32_e32 v14, s23 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v32, s24 +; GFX9-NEXT: v_mov_b32_e32 v33, s25 +; GFX9-NEXT: v_mov_b32_e32 v21, s26 +; GFX9-NEXT: v_mov_b32_e32 v22, s27 ; GFX9-NEXT: v_mov_b32_e32 v17, s28 ; GFX9-NEXT: v_mov_b32_e32 v18, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill @@ -38756,18 +38732,18 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; GFX9-NEXT: s_cbranch_execnz .LBB53_3 ; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 @@ -38777,12 +38753,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 ; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v9 @@ -38797,34 +38773,30 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 16, v18 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 ; GFX9-NEXT: .LBB53_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v30 -; GFX9-NEXT: v_lshl_or_b32 v30, v43, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v31 -; GFX9-NEXT: v_lshl_or_b32 v31, v42, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v23 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v24 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v19 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v32 +; GFX9-NEXT: v_lshl_or_b32 v32, v43, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v33 +; GFX9-NEXT: v_lshl_or_b32 v33, v42, 16, v13 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v21 ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v13 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v13 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v17 ; GFX9-NEXT: v_lshl_or_b32 v12, v12, 16, v13 @@ -38837,30 +38809,34 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v16, v54, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; GFX9-NEXT: v_lshl_or_b32 v17, v53, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v26, v26, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v18, v52, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v27, v27, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v51, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v21 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v20, v50, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v21, v49, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v23 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v22, v48, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v23, v39, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v10 ; GFX9-NEXT: v_lshl_or_b32 v24, v38, 16, v0 @@ -38887,10 +38863,10 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; GFX9-NEXT: ; implicit-def: $vgpr27 ; GFX9-NEXT: ; implicit-def: $vgpr28 ; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr32 -; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr12 @@ -39607,181 +39583,178 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v54 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v50 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v51 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v46 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v44 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v32 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -40619,24 +40592,24 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -40654,12 +40627,12 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v27, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 @@ -40679,9 +40652,9 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) @@ -40690,7 +40663,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -40715,26 +40688,26 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 ; SI-NEXT: v_or_b32_e32 v0, v11, v0 -; SI-NEXT: v_or_b32_e32 v2, v14, v2 +; SI-NEXT: v_or_b32_e32 v2, v12, v2 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 @@ -40742,13 +40715,13 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v51, v44 ; SI-NEXT: v_or_b32_e32 v7, v45, v7 -; SI-NEXT: v_or_b32_e32 v8, v40, v8 +; SI-NEXT: v_or_b32_e32 v8, v41, v8 ; SI-NEXT: v_or_b32_e32 v9, v55, v9 -; SI-NEXT: v_or_b32_e32 v10, v54, v10 -; SI-NEXT: v_or_b32_e32 v11, v47, v11 -; SI-NEXT: v_or_b32_e32 v12, v60, v12 +; SI-NEXT: v_or_b32_e32 v10, v47, v10 +; SI-NEXT: v_or_b32_e32 v11, v53, v11 +; SI-NEXT: v_or_b32_e32 v12, v57, v12 ; SI-NEXT: v_or_b32_e32 v13, v52, v13 ; SI-NEXT: v_or_b32_e32 v14, v63, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 @@ -40776,26 +40749,31 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v60 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 @@ -40821,136 +40799,128 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v30 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v40 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v56 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v57 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v57 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload @@ -41068,7 +41038,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v49, v62 ; SI-NEXT: v_mov_b32_e32 v62, v27 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: v_mov_b32_e32 v51, v44 ; SI-NEXT: v_mov_b32_e32 v27, v62 ; SI-NEXT: v_mov_b32_e32 v62, v49 ; SI-NEXT: v_mov_b32_e32 v26, v61 @@ -41139,51 +41109,51 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 ; VI-NEXT: v_or_b32_sdwa v14, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v23, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -41204,13 +41174,13 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; VI-NEXT: .LBB55_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -41568,61 +41538,77 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v189, v3 :: v_dual_mov_b32 v190, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v191, v1 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v185, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 ; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v191, 0x200, v191 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v190, 0x200, v190 op_sel_hi:[0,1] @@ -41631,22 +41617,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB55_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -43240,174 +43210,187 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB57_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v51, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v57 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v19 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v45 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v25 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v32, v44 +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v14 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, v15 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v4, v16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 +; SI-NEXT: v_mov_b32_e32 v5, v17 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 +; SI-NEXT: v_mov_b32_e32 v7, v18 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v8, v19 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v47 ; SI-NEXT: s_branch .LBB57_3 ; SI-NEXT: .LBB57_2: ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v8, v19 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v7, v18 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v5, v17 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v4, v16 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v2, v15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v1, v14 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -43415,28 +43398,20 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -43476,244 +43451,234 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: .LBB57_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v62 -; SI-NEXT: v_mov_b32_e32 v62, v32 -; SI-NEXT: v_mov_b32_e32 v32, v37 -; SI-NEXT: v_mov_b32_e32 v37, v39 -; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v58, v61 +; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: v_mov_b32_e32 v31, v33 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v49, v51 ; SI-NEXT: v_mov_b32_e32 v51, v53 ; SI-NEXT: v_mov_b32_e32 v53, v55 ; SI-NEXT: v_mov_b32_e32 v55, v41 ; SI-NEXT: v_mov_b32_e32 v41, v42 ; SI-NEXT: s_cbranch_vccnz .LBB57_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v43 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v4 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v2 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s18 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s21 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s22 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 ; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s24 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 ; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s26 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 ; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 -; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s27 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, s28 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v60, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s27 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s17 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v42 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v52, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v54, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v39, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v51, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v53, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v41, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v41, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v19 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v44 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v63, v42 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v35, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v52, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v54, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v51, v21 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: .LBB57_5: ; %end +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -43721,43 +43686,47 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -43767,8 +43736,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -43778,7 +43747,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -43789,7 +43758,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -43800,7 +43769,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -43811,8 +43780,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -43822,8 +43791,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -43833,8 +43802,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -43844,30 +43813,26 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -43875,60 +43840,56 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -43936,7 +43897,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -45836,15 +45797,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill @@ -45861,482 +45813,485 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v57, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v46, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v45, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v63, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v41, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v28, v32 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v22, v34 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v39 +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v19, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v56, v55 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v41 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: v_mov_b32_e32 v38, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 -; SI-NEXT: v_mov_b32_e32 v28, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v36, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 -; SI-NEXT: v_mov_b32_e32 v9, v15 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_mov_b32_e32 v23, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v39, v7, v19 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v38, v7, v15 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: v_mov_b32_e32 v29, v11 -; SI-NEXT: v_or_b32_e32 v5, v5, v23 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v53, v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v5, v5, v25 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v49, v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v7, v7, v17 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_or_b32_e32 v5, v5, v34 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 -; SI-NEXT: v_or_b32_e32 v5, v5, v21 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v5, v5, v11 -; SI-NEXT: v_or_b32_e32 v56, v7, v13 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v23, v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 -; SI-NEXT: v_or_b32_e32 v36, v1, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 -; SI-NEXT: v_or_b32_e32 v37, v28, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v48, v3, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v39, v5, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v35, v3, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_or_b32_e32 v33, v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v30 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_or_b32_e32 v31, v29, v1 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 -; SI-NEXT: v_or_b32_e32 v2, v2, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v57 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_or_b32_e32 v4, v4, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v57 -; SI-NEXT: v_or_b32_e32 v6, v6, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 -; SI-NEXT: v_or_b32_e32 v8, v8, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v47 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v63 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v47 -; SI-NEXT: v_or_b32_e32 v12, v12, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v63 -; SI-NEXT: v_or_b32_e32 v14, v14, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v62 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 -; SI-NEXT: v_or_b32_e32 v18, v18, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v58 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 -; SI-NEXT: v_lshr_b64 v[50:51], v[17:18], 16 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v62 -; SI-NEXT: v_or_b32_e32 v22, v22, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v61 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v58 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v45 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v28 -; SI-NEXT: v_lshr_b64 v[54:55], v[25:26], 16 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v46 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v61 -; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 -; SI-NEXT: v_or_b32_e32 v16, v16, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v45 -; SI-NEXT: v_or_b32_e32 v20, v20, v27 -; SI-NEXT: v_mov_b32_e32 v53, v33 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v46 -; SI-NEXT: v_or_b32_e32 v24, v24, v27 -; SI-NEXT: v_lshr_b64 v[43:44], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[33:34], v[7:8], 16 -; SI-NEXT: v_mov_b32_e32 v7, v56 -; SI-NEXT: v_lshr_b64 v[55:56], v[3:4], 16 -; SI-NEXT: v_mov_b32_e32 v44, v37 -; SI-NEXT: v_lshr_b64 v[41:42], v[19:20], 16 -; SI-NEXT: v_mov_b32_e32 v19, v39 -; SI-NEXT: v_lshr_b64 v[39:40], v[15:16], 16 -; SI-NEXT: v_mov_b32_e32 v15, v38 -; SI-NEXT: v_lshr_b64 v[37:38], v[11:12], 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v42, v36 -; SI-NEXT: v_mov_b32_e32 v40, v35 -; SI-NEXT: v_mov_b32_e32 v51, v32 -; SI-NEXT: v_lshr_b64 v[48:49], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 -; SI-NEXT: v_mov_b32_e32 v34, v31 -; SI-NEXT: v_lshr_b64 v[31:32], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[1:2], 16 -; SI-NEXT: v_mov_b32_e32 v32, v29 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v27, v20, v5 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v33 +; SI-NEXT: v_or_b32_e32 v25, v20, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v56 +; SI-NEXT: v_or_b32_e32 v2, v2, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; SI-NEXT: v_or_b32_e32 v4, v4, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v61 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_or_b32_e32 v6, v6, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v61 +; SI-NEXT: v_or_b32_e32 v8, v8, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v63 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v26 +; SI-NEXT: v_or_b32_e32 v10, v10, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v21 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v63 +; SI-NEXT: v_or_b32_e32 v12, v12, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v58 +; SI-NEXT: v_or_b32_e32 v31, v20, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v57 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v59 +; SI-NEXT: v_or_b32_e32 v33, v21, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v35, v19, v20 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v60 +; SI-NEXT: v_or_b32_e32 v37, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 +; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v20 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v60 +; SI-NEXT: v_or_b32_e32 v14, v14, v20 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_or_b32_e32 v16, v16, v19 +; SI-NEXT: v_lshr_b64 v[42:43], v[15:16], 16 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v46 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_lshr_b64 v[19:20], v[1:2], 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[44:45], v[17:18], 16 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_lshr_b64 v[27:28], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v45, v26 +; SI-NEXT: v_mov_b32_e32 v28, v25 +; SI-NEXT: v_lshr_b64 v[25:26], v[36:37], 16 +; SI-NEXT: v_mov_b32_e32 v15, v53 +; SI-NEXT: v_mov_b32_e32 v13, v51 +; SI-NEXT: v_mov_b32_e32 v26, v38 +; SI-NEXT: v_lshr_b64 v[40:41], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[30:31], 16 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_lshr_b64 v[50:51], v[11:12], 16 +; SI-NEXT: v_mov_b32_e32 v11, v48 +; SI-NEXT: v_lshr_b64 v[48:49], v[9:10], 16 +; SI-NEXT: v_mov_b32_e32 v9, v39 +; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 +; SI-NEXT: v_mov_b32_e32 v7, v23 +; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v29 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v60 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index 0e7bca4f61bfb..fc760adf05244 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -2503,13 +2503,13 @@ define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v6, s22 ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2561,13 +2561,13 @@ define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2619,13 +2619,13 @@ define inreg <28 x i32> @bitcast_v14f64_to_v28i32_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v6, s22 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 @@ -3768,6 +3768,11 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[18:19], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: s_lshr_b32 s30, s5, 16 ; SI-NEXT: s_lshr_b32 s31, s7, 16 ; SI-NEXT: s_lshr_b32 s34, s9, 16 @@ -3782,43 +3787,40 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s51, s41, 16 ; SI-NEXT: s_lshr_b32 s52, s43, 16 ; SI-NEXT: s_lshr_b32 s53, s45, 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s92, 16 -; SI-NEXT: s_and_b32 s29, s44, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s44, 0xffff +; SI-NEXT: s_lshl_b32 s29, s92, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v1, s27 ; SI-NEXT: s_and_b32 s27, s45, 0xffff ; SI-NEXT: s_lshl_b32 s29, s53, 16 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s90, 16 -; SI-NEXT: s_and_b32 s29, s42, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xffff -; SI-NEXT: s_lshl_b32 s29, s52, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 +; SI-NEXT: s_and_b32 s27, s42, 0xffff +; SI-NEXT: s_lshl_b32 s29, s90, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_lshl_b32 s27, s88, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s29, s52, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s40, 0xffff +; SI-NEXT: s_lshl_b32 s29, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s27 ; SI-NEXT: s_and_b32 s27, s41, 0xffff ; SI-NEXT: s_lshl_b32 s29, s51, 16 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -5403,10 +5405,10 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v27, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v59 ; VI-NEXT: v_add_u16_sdwa v1, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v58 -; VI-NEXT: v_add_u16_sdwa v3, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v1, 3, v58 +; VI-NEXT: v_add_u16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v57 ; VI-NEXT: v_add_u16_sdwa v3, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -5591,8 +5593,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -5620,8 +5622,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -5671,8 +5673,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 @@ -5762,8 +5764,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -6041,154 +6043,150 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v10 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v46, v6 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: v_mov_b32_e32 v57, v4 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v58, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_mov_b32_e32 v33, v6 -; SI-NEXT: v_mov_b32_e32 v35, v4 -; SI-NEXT: v_mov_b32_e32 v39, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: v_mov_b32_e32 v58, v2 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: v_mov_b32_e32 v31, v26 -; SI-NEXT: v_mov_b32_e32 v41, v24 -; SI-NEXT: v_mov_b32_e32 v42, v22 -; SI-NEXT: v_mov_b32_e32 v43, v20 -; SI-NEXT: v_mov_b32_e32 v49, v18 -; SI-NEXT: v_mov_b32_e32 v44, v16 -; SI-NEXT: v_mov_b32_e32 v45, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v51, v22 +; SI-NEXT: v_mov_b32_e32 v40, v20 +; SI-NEXT: v_mov_b32_e32 v41, v18 +; SI-NEXT: v_mov_b32_e32 v42, v16 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v7, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v11, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v7, v0, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v12, v0, v3 +; SI-NEXT: v_or_b32_e32 v8, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v10, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v11, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v13, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v15, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v17, v0, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v19, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v16, v0, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v17, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v55 +; SI-NEXT: v_or_b32_e32 v19, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v34 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v52 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: v_or_b32_e32 v26, v0, v60 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_or_b32_e32 v27, v0, v29 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -6225,100 +6223,101 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 @@ -6344,83 +6343,83 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v58, v47 ; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: v_mov_b32_e32 v41, v30 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v62, v59 ; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mov_b32_e32 v56, v50 -; SI-NEXT: v_mov_b32_e32 v50, v45 -; SI-NEXT: v_mov_b32_e32 v45, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: v_mov_b32_e32 v45, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v61, v48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v39, v37 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: v_mov_b32_e32 v35, v33 +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: v_mov_b32_e32 v60, v57 ; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v49 -; SI-NEXT: v_mov_b32_e32 v49, v43 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v53, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mov_b32_e32 v46, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: v_mov_b32_e32 v43, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v53 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v49 -; SI-NEXT: v_mov_b32_e32 v49, v46 +; SI-NEXT: v_mov_b32_e32 v29, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v43 +; SI-NEXT: v_mov_b32_e32 v43, v55 +; SI-NEXT: v_mov_b32_e32 v55, v46 ; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v45 -; SI-NEXT: v_mov_b32_e32 v45, v50 -; SI-NEXT: v_mov_b32_e32 v50, v56 +; SI-NEXT: v_mov_b32_e32 v57, v60 +; SI-NEXT: v_mov_b32_e32 v60, v33 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v34, v36 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v61 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: v_mov_b32_e32 v42, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v45 +; SI-NEXT: v_mov_b32_e32 v45, v56 ; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: v_mov_b32_e32 v59, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v41 +; SI-NEXT: v_mov_b32_e32 v59, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: v_mov_b32_e32 v50, v41 ; SI-NEXT: v_mov_b32_e32 v41, v44 ; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v56i16_to_v28i32_scalar: @@ -6472,61 +6471,61 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -6549,11 +6548,11 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -6978,61 +6977,77 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] @@ -7043,22 +7058,6 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB15_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -10216,46 +10215,54 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 @@ -10271,157 +10278,145 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -10661,10 +10656,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v27, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v59 -; VI-NEXT: v_add_f16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v58 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v1, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_add_f16_sdwa v2, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v57 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -11311,61 +11306,64 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 @@ -11376,25 +11374,24 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v60 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 @@ -11419,7 +11416,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s21 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -11429,53 +11426,53 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_mov_b32_e32 v36, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mov_b32_e32 v61, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: v_mov_b32_e32 v49, v11 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 +; SI-NEXT: v_mov_b32_e32 v29, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_or_b32_e32 v7, v61, v7 +; SI-NEXT: v_mov_b32_e32 v41, v58 ; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_mov_b32_e32 v41, v60 -; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_or_b32_e32 v9, v45, v9 ; SI-NEXT: v_mov_b32_e32 v40, v56 ; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v38, v12 -; SI-NEXT: v_or_b32_e32 v13, v36, v13 -; SI-NEXT: v_or_b32_e32 v14, v35, v14 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_or_b32_e32 v17, v37, v17 +; SI-NEXT: v_or_b32_e32 v11, v63, v11 +; SI-NEXT: v_or_b32_e32 v12, v29, v12 +; SI-NEXT: v_or_b32_e32 v13, v38, v13 +; SI-NEXT: v_or_b32_e32 v14, v34, v14 +; SI-NEXT: v_or_b32_e32 v15, v37, v15 +; SI-NEXT: v_or_b32_e32 v17, v35, v17 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -11491,11 +11488,11 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -11519,57 +11516,59 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -11582,121 +11581,122 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v42 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -11715,7 +11715,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -11745,19 +11745,14 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -11820,19 +11815,19 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_mov_b32_e32 v52, v37 -; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_mov_b32_e32 v49, v11 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v52, v35 +; SI-NEXT: v_mov_b32_e32 v35, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_mov_b32_e32 v41, v58 ; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_mov_b32_e32 v29, v37 -; SI-NEXT: v_mov_b32_e32 v37, v52 -; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_mov_b32_e32 v30, v35 +; SI-NEXT: v_mov_b32_e32 v35, v52 +; SI-NEXT: v_mov_b32_e32 v29, v44 ; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v56f16_to_v28i32_scalar: @@ -11884,61 +11879,61 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -11959,13 +11954,13 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; VI-NEXT: .LBB19_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -12196,8 +12191,8 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 -; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 @@ -12351,61 +12346,77 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] @@ -12416,22 +12427,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB19_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -14228,13 +14223,13 @@ define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v6, s22 ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 @@ -14286,13 +14281,13 @@ define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 @@ -14344,13 +14339,13 @@ define inreg <28 x float> @bitcast_v14f64_to_v28f32_scalar(<14 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v6, s22 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 @@ -15340,6 +15335,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v23, s18 ; SI-NEXT: v_mov_b32_e32 v24, s19 ; SI-NEXT: v_mov_b32_e32 v25, s20 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v26, s21 ; SI-NEXT: v_mov_b32_e32 v21, s22 ; SI-NEXT: v_mov_b32_e32 v22, s23 @@ -15347,58 +15343,56 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v20, s25 ; SI-NEXT: v_mov_b32_e32 v17, s26 ; SI-NEXT: v_mov_b32_e32 v18, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v15, s28 ; SI-NEXT: v_mov_b32_e32 v16, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 ; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 ; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 ; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 ; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshr_b64 v[35:36], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 -; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 +; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[27:28], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 @@ -15409,153 +15403,153 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_lshr_b64 v[35:36], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[21:22], 16 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshr_b64 v[35:36], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[27:28], 16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshr_b64 v[36:37], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[37:38], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v52 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v37 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; SI-NEXT: v_or_b32_e32 v27, v27, v39 ; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v58 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v51 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 ; SI-NEXT: v_or_b32_e32 v23, v23, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v49 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v38 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v46 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v45 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15567,7 +15561,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15579,7 +15573,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15591,7 +15585,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15603,7 +15597,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15615,7 +15609,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15627,27 +15621,24 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr50 @@ -15656,24 +15647,26 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v28f32_to_v56i16_scalar: @@ -17136,10 +17129,10 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v27, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v59 ; VI-NEXT: v_add_u16_sdwa v1, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v58 -; VI-NEXT: v_add_u16_sdwa v3, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v1, 3, v58 +; VI-NEXT: v_add_u16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v57 ; VI-NEXT: v_add_u16_sdwa v3, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -17324,8 +17317,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -17353,8 +17346,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -17404,8 +17397,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 @@ -17495,8 +17488,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -17774,154 +17767,150 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v10 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v46, v6 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: v_mov_b32_e32 v57, v4 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v58, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_mov_b32_e32 v33, v6 -; SI-NEXT: v_mov_b32_e32 v35, v4 -; SI-NEXT: v_mov_b32_e32 v39, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: v_mov_b32_e32 v58, v2 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: v_mov_b32_e32 v31, v26 -; SI-NEXT: v_mov_b32_e32 v41, v24 -; SI-NEXT: v_mov_b32_e32 v42, v22 -; SI-NEXT: v_mov_b32_e32 v43, v20 -; SI-NEXT: v_mov_b32_e32 v49, v18 -; SI-NEXT: v_mov_b32_e32 v44, v16 -; SI-NEXT: v_mov_b32_e32 v45, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v51, v22 +; SI-NEXT: v_mov_b32_e32 v40, v20 +; SI-NEXT: v_mov_b32_e32 v41, v18 +; SI-NEXT: v_mov_b32_e32 v42, v16 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v7, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v11, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v7, v0, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v12, v0, v3 +; SI-NEXT: v_or_b32_e32 v8, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v10, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v11, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v13, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v15, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v17, v0, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v19, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v16, v0, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v17, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v55 +; SI-NEXT: v_or_b32_e32 v19, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v34 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v52 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: v_or_b32_e32 v26, v0, v60 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_or_b32_e32 v27, v0, v29 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -17958,100 +17947,101 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 @@ -18077,83 +18067,83 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v58, v47 ; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: v_mov_b32_e32 v41, v30 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v62, v59 ; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mov_b32_e32 v56, v50 -; SI-NEXT: v_mov_b32_e32 v50, v45 -; SI-NEXT: v_mov_b32_e32 v45, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: v_mov_b32_e32 v45, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v61, v48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v39, v37 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: v_mov_b32_e32 v35, v33 +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: v_mov_b32_e32 v60, v57 ; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v49 -; SI-NEXT: v_mov_b32_e32 v49, v43 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v53, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mov_b32_e32 v46, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: v_mov_b32_e32 v43, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v53 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v49 -; SI-NEXT: v_mov_b32_e32 v49, v46 +; SI-NEXT: v_mov_b32_e32 v29, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v43 +; SI-NEXT: v_mov_b32_e32 v43, v55 +; SI-NEXT: v_mov_b32_e32 v55, v46 ; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v45 -; SI-NEXT: v_mov_b32_e32 v45, v50 -; SI-NEXT: v_mov_b32_e32 v50, v56 +; SI-NEXT: v_mov_b32_e32 v57, v60 +; SI-NEXT: v_mov_b32_e32 v60, v33 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v34, v36 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v61 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: v_mov_b32_e32 v42, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v45 +; SI-NEXT: v_mov_b32_e32 v45, v56 ; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: v_mov_b32_e32 v59, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v41 +; SI-NEXT: v_mov_b32_e32 v59, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: v_mov_b32_e32 v50, v41 ; SI-NEXT: v_mov_b32_e32 v41, v44 ; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v56i16_to_v28f32_scalar: @@ -18205,61 +18195,61 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -18282,11 +18272,11 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -18711,61 +18701,77 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 ; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] @@ -18776,22 +18782,6 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB31_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -20278,14 +20268,14 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v25, s16 ; SI-NEXT: v_mov_b32_e32 v23, s17 ; SI-NEXT: v_mov_b32_e32 v22, s18 -; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_mov_b32_e32 v21, s19 ; SI-NEXT: v_mov_b32_e32 v34, s20 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v35, s21 ; SI-NEXT: v_mov_b32_e32 v33, s22 ; SI-NEXT: v_mov_b32_e32 v32, s23 ; SI-NEXT: v_mov_b32_e32 v31, s24 -; SI-NEXT: v_mov_b32_e32 v29, s25 +; SI-NEXT: v_mov_b32_e32 v30, s25 ; SI-NEXT: v_mov_b32_e32 v28, s26 ; SI-NEXT: v_mov_b32_e32 v27, s27 ; SI-NEXT: v_mov_b32_e32 v26, s28 @@ -20315,7 +20305,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 @@ -20327,11 +20317,11 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v14 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill @@ -20343,14 +20333,14 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v8 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 @@ -20373,7 +20363,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -20400,8 +20390,8 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -20419,21 +20409,21 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v62, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v35 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v25 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 @@ -20457,7 +20447,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -20466,29 +20456,29 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v4 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v3 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v20, 1.0, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v21 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v35 ; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -20501,22 +20491,22 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v32 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 @@ -20535,20 +20525,20 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -20557,21 +20547,21 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v39, v13 ; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill @@ -20585,9 +20575,9 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: .LBB33_3: ; %end ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v20 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -20596,7 +20586,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 @@ -20626,7 +20616,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -20653,7 +20643,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -20738,7 +20728,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -20749,7 +20739,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -20760,7 +20750,7 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -20790,9 +20780,9 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -20847,19 +20837,19 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: ; kill: killed $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr43 @@ -20868,13 +20858,13 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr39 @@ -22124,46 +22114,54 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 @@ -22179,157 +22177,145 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -22569,10 +22555,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v27, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v59 -; VI-NEXT: v_add_f16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v58 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v1, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_add_f16_sdwa v2, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v57 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -23219,61 +23205,64 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 @@ -23284,25 +23273,24 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v60 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 @@ -23327,7 +23315,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s21 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -23337,53 +23325,53 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_mov_b32_e32 v36, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mov_b32_e32 v61, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: v_mov_b32_e32 v49, v11 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 +; SI-NEXT: v_mov_b32_e32 v29, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_or_b32_e32 v7, v61, v7 +; SI-NEXT: v_mov_b32_e32 v41, v58 ; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_mov_b32_e32 v41, v60 -; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_or_b32_e32 v9, v45, v9 ; SI-NEXT: v_mov_b32_e32 v40, v56 ; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v38, v12 -; SI-NEXT: v_or_b32_e32 v13, v36, v13 -; SI-NEXT: v_or_b32_e32 v14, v35, v14 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_or_b32_e32 v17, v37, v17 +; SI-NEXT: v_or_b32_e32 v11, v63, v11 +; SI-NEXT: v_or_b32_e32 v12, v29, v12 +; SI-NEXT: v_or_b32_e32 v13, v38, v13 +; SI-NEXT: v_or_b32_e32 v14, v34, v14 +; SI-NEXT: v_or_b32_e32 v15, v37, v15 +; SI-NEXT: v_or_b32_e32 v17, v35, v17 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -23399,11 +23387,11 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -23427,57 +23415,59 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -23490,121 +23480,122 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v42 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -23623,7 +23614,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -23653,19 +23644,14 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -23728,19 +23714,19 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_mov_b32_e32 v52, v37 -; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_mov_b32_e32 v49, v11 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v52, v35 +; SI-NEXT: v_mov_b32_e32 v35, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_mov_b32_e32 v41, v58 ; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_mov_b32_e32 v29, v37 -; SI-NEXT: v_mov_b32_e32 v37, v52 -; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_mov_b32_e32 v30, v35 +; SI-NEXT: v_mov_b32_e32 v35, v52 +; SI-NEXT: v_mov_b32_e32 v29, v44 ; SI-NEXT: s_branch .LBB35_2 ; ; VI-LABEL: bitcast_v56f16_to_v28f32_scalar: @@ -23792,61 +23778,61 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -23867,13 +23853,13 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -24104,8 +24090,8 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 -; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 @@ -24259,61 +24245,77 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 ; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] @@ -24324,22 +24326,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB35_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -25248,13 +25234,13 @@ define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v3, s19 ; SI-NEXT: v_mov_b32_e32 v4, s20 ; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v6, s22 ; SI-NEXT: v_mov_b32_e32 v7, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 @@ -25306,13 +25292,13 @@ define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v3, s19 ; VI-NEXT: v_mov_b32_e32 v4, s20 ; VI-NEXT: v_mov_b32_e32 v5, s21 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v6, s22 ; VI-NEXT: v_mov_b32_e32 v7, s23 ; VI-NEXT: v_mov_b32_e32 v8, s24 ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB39_4 @@ -25364,13 +25350,13 @@ define inreg <14 x i64> @bitcast_v14f64_to_v14i64_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v6, s22 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_mov_b32_e32 v8, s24 ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 @@ -26547,37 +26533,39 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[90:91], s[42:43], 16 ; SI-NEXT: s_lshr_b64 s[92:93], s[44:45], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s92, 16 -; SI-NEXT: s_and_b32 s29, s44, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s44, 0xffff +; SI-NEXT: s_lshl_b32 s29, s92, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v1, s27 ; SI-NEXT: s_and_b32 s27, s45, 0xffff ; SI-NEXT: s_lshl_b32 s29, s53, 16 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s90, 16 -; SI-NEXT: s_and_b32 s29, s42, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xffff -; SI-NEXT: s_lshl_b32 s29, s52, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 +; SI-NEXT: s_and_b32 s27, s42, 0xffff +; SI-NEXT: s_lshl_b32 s29, s90, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_lshl_b32 s27, s88, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s29, s52, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s40, 0xffff +; SI-NEXT: s_lshl_b32 s29, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s27 ; SI-NEXT: s_and_b32 s27, s41, 0xffff ; SI-NEXT: s_lshl_b32 s29, s51, 16 -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -28162,10 +28150,10 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v27, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v59 ; VI-NEXT: v_add_u16_sdwa v1, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v58 -; VI-NEXT: v_add_u16_sdwa v3, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v1, 3, v58 +; VI-NEXT: v_add_u16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v57 ; VI-NEXT: v_add_u16_sdwa v3, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -28350,8 +28338,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -28379,8 +28367,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -28430,8 +28418,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 @@ -28521,8 +28509,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -28800,154 +28788,150 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v10 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v46, v6 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: v_mov_b32_e32 v57, v4 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v58, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_mov_b32_e32 v33, v6 -; SI-NEXT: v_mov_b32_e32 v35, v4 -; SI-NEXT: v_mov_b32_e32 v39, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: v_mov_b32_e32 v58, v2 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: v_mov_b32_e32 v31, v26 -; SI-NEXT: v_mov_b32_e32 v41, v24 -; SI-NEXT: v_mov_b32_e32 v42, v22 -; SI-NEXT: v_mov_b32_e32 v43, v20 -; SI-NEXT: v_mov_b32_e32 v49, v18 -; SI-NEXT: v_mov_b32_e32 v44, v16 -; SI-NEXT: v_mov_b32_e32 v45, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v51, v22 +; SI-NEXT: v_mov_b32_e32 v40, v20 +; SI-NEXT: v_mov_b32_e32 v41, v18 +; SI-NEXT: v_mov_b32_e32 v42, v16 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v7, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v11, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v7, v0, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v12, v0, v3 +; SI-NEXT: v_or_b32_e32 v8, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v10, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v11, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v13, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v15, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v17, v0, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v19, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v16, v0, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v17, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v55 +; SI-NEXT: v_or_b32_e32 v19, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v34 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v52 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: v_or_b32_e32 v26, v0, v60 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_or_b32_e32 v27, v0, v29 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -28984,100 +28968,101 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 @@ -29103,83 +29088,83 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v58, v47 ; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: v_mov_b32_e32 v41, v30 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v62, v59 ; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mov_b32_e32 v56, v50 -; SI-NEXT: v_mov_b32_e32 v50, v45 -; SI-NEXT: v_mov_b32_e32 v45, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: v_mov_b32_e32 v45, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v61, v48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v39, v37 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: v_mov_b32_e32 v35, v33 +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: v_mov_b32_e32 v60, v57 ; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v49 -; SI-NEXT: v_mov_b32_e32 v49, v43 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v53, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mov_b32_e32 v46, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: v_mov_b32_e32 v43, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v53 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v49 -; SI-NEXT: v_mov_b32_e32 v49, v46 +; SI-NEXT: v_mov_b32_e32 v29, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v43 +; SI-NEXT: v_mov_b32_e32 v43, v55 +; SI-NEXT: v_mov_b32_e32 v55, v46 ; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v45 -; SI-NEXT: v_mov_b32_e32 v45, v50 -; SI-NEXT: v_mov_b32_e32 v50, v56 +; SI-NEXT: v_mov_b32_e32 v57, v60 +; SI-NEXT: v_mov_b32_e32 v60, v33 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v34, v36 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v61 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: v_mov_b32_e32 v42, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v45 +; SI-NEXT: v_mov_b32_e32 v45, v56 ; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: v_mov_b32_e32 v59, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v41 +; SI-NEXT: v_mov_b32_e32 v59, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: v_mov_b32_e32 v50, v41 ; SI-NEXT: v_mov_b32_e32 v41, v44 ; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v56i16_to_v14i64_scalar: @@ -29231,61 +29216,61 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -29308,11 +29293,11 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -29737,61 +29722,77 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] @@ -29802,22 +29803,6 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB43_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -32989,46 +32974,54 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 @@ -33044,157 +33037,145 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -33434,10 +33415,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v27, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v59 -; VI-NEXT: v_add_f16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v58 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v1, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_add_f16_sdwa v2, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v57 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -34084,61 +34065,64 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 @@ -34149,25 +34133,24 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v60 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 @@ -34192,7 +34175,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s21 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -34202,53 +34185,53 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_mov_b32_e32 v36, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mov_b32_e32 v61, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: v_mov_b32_e32 v49, v11 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 +; SI-NEXT: v_mov_b32_e32 v29, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_or_b32_e32 v7, v61, v7 +; SI-NEXT: v_mov_b32_e32 v41, v58 ; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_mov_b32_e32 v41, v60 -; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_or_b32_e32 v9, v45, v9 ; SI-NEXT: v_mov_b32_e32 v40, v56 ; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v38, v12 -; SI-NEXT: v_or_b32_e32 v13, v36, v13 -; SI-NEXT: v_or_b32_e32 v14, v35, v14 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_or_b32_e32 v17, v37, v17 +; SI-NEXT: v_or_b32_e32 v11, v63, v11 +; SI-NEXT: v_or_b32_e32 v12, v29, v12 +; SI-NEXT: v_or_b32_e32 v13, v38, v13 +; SI-NEXT: v_or_b32_e32 v14, v34, v14 +; SI-NEXT: v_or_b32_e32 v15, v37, v15 +; SI-NEXT: v_or_b32_e32 v17, v35, v17 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -34264,11 +34247,11 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -34292,57 +34275,59 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -34355,121 +34340,122 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v42 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -34488,7 +34474,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -34518,19 +34504,14 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -34593,19 +34574,19 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_mov_b32_e32 v52, v37 -; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_mov_b32_e32 v49, v11 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v52, v35 +; SI-NEXT: v_mov_b32_e32 v35, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_mov_b32_e32 v41, v58 ; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_mov_b32_e32 v29, v37 -; SI-NEXT: v_mov_b32_e32 v37, v52 -; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_mov_b32_e32 v30, v35 +; SI-NEXT: v_mov_b32_e32 v35, v52 +; SI-NEXT: v_mov_b32_e32 v29, v44 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v56f16_to_v14i64_scalar: @@ -34657,61 +34638,61 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -34732,13 +34713,13 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -34969,8 +34950,8 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 -; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 @@ -35124,61 +35105,77 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] @@ -35189,22 +35186,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB47_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -36334,6 +36315,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v23, s18 ; SI-NEXT: v_mov_b32_e32 v24, s19 ; SI-NEXT: v_mov_b32_e32 v25, s20 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v26, s21 ; SI-NEXT: v_mov_b32_e32 v21, s22 ; SI-NEXT: v_mov_b32_e32 v22, s23 @@ -36341,58 +36323,56 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v20, s25 ; SI-NEXT: v_mov_b32_e32 v17, s26 ; SI-NEXT: v_mov_b32_e32 v18, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v15, s28 ; SI-NEXT: v_mov_b32_e32 v16, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 ; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 ; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 ; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 ; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshr_b64 v[35:36], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 -; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 +; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[27:28], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 @@ -36400,142 +36380,142 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshr_b64 v[35:36], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[21:22], 16 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[35:36], v[15:16], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 -; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[27:28], 16 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: v_lshr_b64 v[36:37], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v52 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v37 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; SI-NEXT: v_or_b32_e32 v27, v27, v39 ; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v58 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v51 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 ; SI-NEXT: v_or_b32_e32 v23, v23, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v49 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v56 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v38 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v46 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v45 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v44 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -36547,7 +36527,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -36559,7 +36539,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -36571,7 +36551,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -36583,7 +36563,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -36595,7 +36575,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -36607,27 +36587,24 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr50 @@ -36636,43 +36613,45 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v14f64_to_v56i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v15, s18 -; VI-NEXT: v_mov_b32_e32 v16, s19 -; VI-NEXT: v_mov_b32_e32 v32, s20 -; VI-NEXT: v_mov_b32_e32 v33, s21 -; VI-NEXT: v_mov_b32_e32 v25, s22 -; VI-NEXT: v_mov_b32_e32 v26, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v21, s26 -; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_mov_b32_e32 v21, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v34, s22 +; VI-NEXT: v_mov_b32_e32 v35, s23 +; VI-NEXT: v_mov_b32_e32 v25, s24 +; VI-NEXT: v_mov_b32_e32 v26, s25 +; VI-NEXT: v_mov_b32_e32 v23, s26 +; VI-NEXT: v_mov_b32_e32 v24, s27 ; VI-NEXT: v_mov_b32_e32 v19, s28 ; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill @@ -36701,18 +36680,18 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; VI-NEXT: s_cbranch_execnz .LBB49_3 ; VI-NEXT: .LBB49_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -36723,12 +36702,12 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 ; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 @@ -36745,43 +36724,43 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; VI-NEXT: .LBB49_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v17, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 -; VI-NEXT: v_or_b32_sdwa v30, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; VI-NEXT: v_or_b32_sdwa v31, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v21, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v47 -; VI-NEXT: v_or_b32_sdwa v32, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v34, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; VI-NEXT: v_or_b32_sdwa v33, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; VI-NEXT: v_or_b32_sdwa v38, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 -; VI-NEXT: v_or_b32_sdwa v39, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_sdwa v48, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -36843,10 +36822,10 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr38 @@ -36873,19 +36852,19 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_mov_b32_e32 v17, s16 -; GFX9-NEXT: v_mov_b32_e32 v18, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v32, s20 -; GFX9-NEXT: v_mov_b32_e32 v33, s21 -; GFX9-NEXT: v_mov_b32_e32 v25, s22 -; GFX9-NEXT: v_mov_b32_e32 v26, s23 -; GFX9-NEXT: v_mov_b32_e32 v23, s24 -; GFX9-NEXT: v_mov_b32_e32 v24, s25 -; GFX9-NEXT: v_mov_b32_e32 v21, s26 -; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: v_mov_b32_e32 v21, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v34, s22 +; GFX9-NEXT: v_mov_b32_e32 v35, s23 +; GFX9-NEXT: v_mov_b32_e32 v25, s24 +; GFX9-NEXT: v_mov_b32_e32 v26, s25 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 +; GFX9-NEXT: v_mov_b32_e32 v24, s27 ; GFX9-NEXT: v_mov_b32_e32 v19, s28 ; GFX9-NEXT: v_mov_b32_e32 v20, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill @@ -36914,18 +36893,18 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; GFX9-NEXT: s_cbranch_execnz .LBB49_3 ; GFX9-NEXT: .LBB49_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -36936,12 +36915,12 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 ; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 @@ -36958,38 +36937,34 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; GFX9-NEXT: .LBB49_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v32, v47, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v33 -; GFX9-NEXT: v_lshl_or_b32 v33, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v34, v47, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v35, v46, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 -; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 -; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v19 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -37001,11 +36976,11 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v45, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v16, v44, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v43, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_lshl_or_b32 v18, v42, 16, v0 @@ -37021,7 +36996,11 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v21, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GFX9-NEXT: v_lshl_or_b32 v22, v54, 16, v0 @@ -37056,10 +37035,10 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr38 @@ -38086,10 +38065,10 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v27, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v59 ; VI-NEXT: v_add_u16_sdwa v1, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v58 -; VI-NEXT: v_add_u16_sdwa v3, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v1, 3, v58 +; VI-NEXT: v_add_u16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v57 ; VI-NEXT: v_add_u16_sdwa v3, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -38274,8 +38253,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v46 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v47 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v57 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 ; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -38303,8 +38282,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v37 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -38354,8 +38333,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 @@ -38445,8 +38424,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 -; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 -; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 +; GFX9-NEXT: v_perm_b32 v1, v62, v58, s6 +; GFX9-NEXT: v_perm_b32 v2, v35, v57, s6 ; GFX9-NEXT: v_perm_b32 v3, v34, v56, s6 ; GFX9-NEXT: v_perm_b32 v4, v33, v47, s6 ; GFX9-NEXT: v_perm_b32 v5, v61, v46, s6 @@ -38724,154 +38703,150 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v53, v12 +; SI-NEXT: v_mov_b32_e32 v54, v10 +; SI-NEXT: v_mov_b32_e32 v43, v8 +; SI-NEXT: v_mov_b32_e32 v46, v6 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: v_mov_b32_e32 v57, v4 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v58, v10 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_mov_b32_e32 v33, v6 -; SI-NEXT: v_mov_b32_e32 v35, v4 -; SI-NEXT: v_mov_b32_e32 v39, v2 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v61, v0 +; SI-NEXT: v_mov_b32_e32 v58, v2 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 ; SI-NEXT: v_mov_b32_e32 v31, v26 -; SI-NEXT: v_mov_b32_e32 v41, v24 -; SI-NEXT: v_mov_b32_e32 v42, v22 -; SI-NEXT: v_mov_b32_e32 v43, v20 -; SI-NEXT: v_mov_b32_e32 v49, v18 -; SI-NEXT: v_mov_b32_e32 v44, v16 -; SI-NEXT: v_mov_b32_e32 v45, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; SI-NEXT: v_mov_b32_e32 v50, v24 +; SI-NEXT: v_mov_b32_e32 v51, v22 +; SI-NEXT: v_mov_b32_e32 v40, v20 +; SI-NEXT: v_mov_b32_e32 v41, v18 +; SI-NEXT: v_mov_b32_e32 v42, v16 +; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v7, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v11, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v7, v0, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v12, v0, v3 +; SI-NEXT: v_or_b32_e32 v8, v0, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v9, v0, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v10, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v11, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v13, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 -; SI-NEXT: v_or_b32_e32 v15, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v17, v0, v38 +; SI-NEXT: v_or_b32_e32 v14, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v37 +; SI-NEXT: v_or_b32_e32 v15, v0, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v19, v0, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_or_b32_e32 v16, v0, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v17, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v18, v0, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v55 +; SI-NEXT: v_or_b32_e32 v19, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: v_or_b32_e32 v20, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v34 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: v_or_b32_e32 v21, v0, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: v_or_b32_e32 v22, v0, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: v_or_b32_e32 v23, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v53 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v52 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: v_or_b32_e32 v24, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: v_or_b32_e32 v26, v0, v60 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_or_b32_e32 v27, v0, v29 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -38908,100 +38883,101 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v37, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v55, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v53, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v52, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 @@ -39027,83 +39003,83 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v51 -; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v58, v47 ; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: v_mov_b32_e32 v41, v30 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v41, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v62, v59 ; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mov_b32_e32 v56, v50 -; SI-NEXT: v_mov_b32_e32 v50, v45 -; SI-NEXT: v_mov_b32_e32 v45, v42 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v57 +; SI-NEXT: v_mov_b32_e32 v56, v45 +; SI-NEXT: v_mov_b32_e32 v45, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v52 +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: v_mov_b32_e32 v42, v51 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v61, v48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v39, v37 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: v_mov_b32_e32 v35, v33 +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: v_mov_b32_e32 v33, v60 +; SI-NEXT: v_mov_b32_e32 v60, v57 ; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v49 -; SI-NEXT: v_mov_b32_e32 v49, v43 -; SI-NEXT: v_mov_b32_e32 v43, v31 -; SI-NEXT: v_mov_b32_e32 v53, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v38 -; SI-NEXT: v_mov_b32_e32 v38, v37 -; SI-NEXT: v_mov_b32_e32 v37, v36 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v32 -; SI-NEXT: v_mov_b32_e32 v32, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mov_b32_e32 v46, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: v_mov_b32_e32 v43, v40 +; SI-NEXT: v_mov_b32_e32 v40, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v32 -; SI-NEXT: v_mov_b32_e32 v32, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v37 -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v53 -; SI-NEXT: v_mov_b32_e32 v31, v43 -; SI-NEXT: v_mov_b32_e32 v43, v49 -; SI-NEXT: v_mov_b32_e32 v49, v46 +; SI-NEXT: v_mov_b32_e32 v29, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v31, v40 +; SI-NEXT: v_mov_b32_e32 v40, v43 +; SI-NEXT: v_mov_b32_e32 v43, v55 +; SI-NEXT: v_mov_b32_e32 v55, v46 ; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v42 -; SI-NEXT: v_mov_b32_e32 v42, v45 -; SI-NEXT: v_mov_b32_e32 v45, v50 -; SI-NEXT: v_mov_b32_e32 v50, v56 +; SI-NEXT: v_mov_b32_e32 v57, v60 +; SI-NEXT: v_mov_b32_e32 v60, v33 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v34, v36 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v61 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v51 +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: v_mov_b32_e32 v42, v52 +; SI-NEXT: v_mov_b32_e32 v52, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v45 +; SI-NEXT: v_mov_b32_e32 v45, v56 ; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: v_mov_b32_e32 v59, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v41 +; SI-NEXT: v_mov_b32_e32 v59, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v50 +; SI-NEXT: v_mov_b32_e32 v50, v41 ; SI-NEXT: v_mov_b32_e32 v41, v44 ; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v47, v51 -; SI-NEXT: v_mov_b32_e32 v51, v58 -; SI-NEXT: v_mov_b32_e32 v58, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v47, v58 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v56i16_to_v14f64_scalar: @@ -39155,61 +39131,61 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -39232,11 +39208,11 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v52 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -39661,61 +39637,77 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v186, v186, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v185, v185, 3 op_sel_hi:[1,0] @@ -39726,22 +39718,6 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB51_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -40075,116 +40051,123 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; kill: killed $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v25 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -40192,40 +40175,34 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 -; SI-NEXT: v_mov_b32_e32 v29, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v29 +; SI-NEXT: v_mov_b32_e32 v29, v35 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 @@ -40244,56 +40221,60 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v3 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v21 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[53:54], v[1:2], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[54:55], v[1:2], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 @@ -40314,69 +40295,67 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 -; SI-NEXT: v_mov_b32_e32 v47, v25 -; SI-NEXT: v_mov_b32_e32 v45, v26 -; SI-NEXT: v_mov_b32_e32 v43, v27 -; SI-NEXT: v_mov_b32_e32 v42, v28 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: v_mov_b32_e32 v57, v25 +; SI-NEXT: v_mov_b32_e32 v47, v26 +; SI-NEXT: v_mov_b32_e32 v45, v27 +; SI-NEXT: v_mov_b32_e32 v43, v28 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -40385,45 +40364,52 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -40432,7 +40418,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -40441,7 +40427,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -40449,8 +40435,8 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -40459,7 +40445,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -40468,7 +40454,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -40477,7 +40463,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -40486,27 +40472,25 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -40515,9 +40499,9 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -40526,9 +40510,9 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -40536,45 +40520,40 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -40582,29 +40561,29 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -41168,19 +41147,19 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_mov_b32_e32 v25, s16 -; SI-NEXT: v_mov_b32_e32 v26, s17 +; SI-NEXT: v_mov_b32_e32 v23, s16 +; SI-NEXT: v_mov_b32_e32 v24, s17 ; SI-NEXT: v_mov_b32_e32 v21, s18 ; SI-NEXT: v_mov_b32_e32 v22, s19 ; SI-NEXT: v_mov_b32_e32 v27, s20 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v28, s21 -; SI-NEXT: v_mov_b32_e32 v23, s22 -; SI-NEXT: v_mov_b32_e32 v24, s23 +; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v26, s23 ; SI-NEXT: v_mov_b32_e32 v19, s24 ; SI-NEXT: v_mov_b32_e32 v20, s25 ; SI-NEXT: v_mov_b32_e32 v17, s26 ; SI-NEXT: v_mov_b32_e32 v18, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v15, s28 ; SI-NEXT: v_mov_b32_e32 v16, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -41201,89 +41180,88 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v11 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v9 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v29 ; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v39, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v13 ; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v13 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v11 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 ; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v39, v2 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 ; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v18 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 @@ -41292,17 +41270,17 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v39, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 ; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v39, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 @@ -41314,19 +41292,20 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 ; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 @@ -41346,58 +41325,55 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v4 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 ; SI-NEXT: v_add_f64 v[53:54], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v24 ; SI-NEXT: v_mov_b32_e32 v45, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v53 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v54 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 @@ -41416,21 +41392,22 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 @@ -41438,7 +41415,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 @@ -41446,13 +41423,13 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v21 ; SI-NEXT: v_mov_b32_e32 v29, v14 -; SI-NEXT: v_mov_b32_e32 v57, v11 ; SI-NEXT: v_mov_b32_e32 v47, v12 ; SI-NEXT: v_mov_b32_e32 v43, v13 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill @@ -41477,7 +41454,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 @@ -41507,7 +41484,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -41539,7 +41516,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -41626,20 +41603,18 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -41673,11 +41648,13 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -41729,29 +41706,29 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -41803,19 +41780,19 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v15, s18 -; VI-NEXT: v_mov_b32_e32 v16, s19 -; VI-NEXT: v_mov_b32_e32 v32, s20 -; VI-NEXT: v_mov_b32_e32 v33, s21 -; VI-NEXT: v_mov_b32_e32 v25, s22 -; VI-NEXT: v_mov_b32_e32 v26, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v21, s26 -; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_mov_b32_e32 v21, s16 +; VI-NEXT: v_mov_b32_e32 v22, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 +; VI-NEXT: v_mov_b32_e32 v15, s20 +; VI-NEXT: v_mov_b32_e32 v16, s21 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec +; VI-NEXT: v_mov_b32_e32 v34, s22 +; VI-NEXT: v_mov_b32_e32 v35, s23 +; VI-NEXT: v_mov_b32_e32 v25, s24 +; VI-NEXT: v_mov_b32_e32 v26, s25 +; VI-NEXT: v_mov_b32_e32 v23, s26 +; VI-NEXT: v_mov_b32_e32 v24, s27 ; VI-NEXT: v_mov_b32_e32 v19, s28 ; VI-NEXT: v_mov_b32_e32 v20, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill @@ -41844,18 +41821,18 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; VI-NEXT: s_cbranch_execnz .LBB53_3 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -41866,12 +41843,12 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 ; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 @@ -41888,43 +41865,43 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v20 ; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; VI-NEXT: .LBB53_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_or_b32_sdwa v28, v17, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; VI-NEXT: v_or_b32_sdwa v29, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 -; VI-NEXT: v_or_b32_sdwa v30, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; VI-NEXT: v_or_b32_sdwa v31, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v21, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 +; VI-NEXT: v_or_b32_sdwa v29, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v30 +; VI-NEXT: v_or_b32_sdwa v30, v17, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; VI-NEXT: v_or_b32_sdwa v31, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v32 +; VI-NEXT: v_or_b32_sdwa v32, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v47 -; VI-NEXT: v_or_b32_sdwa v32, v32, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v34, v34, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; VI-NEXT: v_or_b32_sdwa v33, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v26, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; VI-NEXT: v_or_b32_sdwa v38, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v39 -; VI-NEXT: v_or_b32_sdwa v39, v22, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v48 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_sdwa v48, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -41986,10 +41963,10 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr47 ; VI-NEXT: ; implicit-def: $vgpr46 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr37 ; VI-NEXT: ; implicit-def: $vgpr38 @@ -42016,19 +41993,19 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_mov_b32_e32 v17, s16 -; GFX9-NEXT: v_mov_b32_e32 v18, s17 -; GFX9-NEXT: v_mov_b32_e32 v15, s18 -; GFX9-NEXT: v_mov_b32_e32 v16, s19 -; GFX9-NEXT: v_mov_b32_e32 v32, s20 -; GFX9-NEXT: v_mov_b32_e32 v33, s21 -; GFX9-NEXT: v_mov_b32_e32 v25, s22 -; GFX9-NEXT: v_mov_b32_e32 v26, s23 -; GFX9-NEXT: v_mov_b32_e32 v23, s24 -; GFX9-NEXT: v_mov_b32_e32 v24, s25 -; GFX9-NEXT: v_mov_b32_e32 v21, s26 -; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: v_mov_b32_e32 v21, s16 +; GFX9-NEXT: v_mov_b32_e32 v22, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 +; GFX9-NEXT: v_mov_b32_e32 v15, s20 +; GFX9-NEXT: v_mov_b32_e32 v16, s21 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: v_mov_b32_e32 v34, s22 +; GFX9-NEXT: v_mov_b32_e32 v35, s23 +; GFX9-NEXT: v_mov_b32_e32 v25, s24 +; GFX9-NEXT: v_mov_b32_e32 v26, s25 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 +; GFX9-NEXT: v_mov_b32_e32 v24, s27 ; GFX9-NEXT: v_mov_b32_e32 v19, s28 ; GFX9-NEXT: v_mov_b32_e32 v20, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill @@ -42057,18 +42034,18 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; GFX9-NEXT: s_cbranch_execnz .LBB53_3 ; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 @@ -42079,12 +42056,12 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 ; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v11 @@ -42101,38 +42078,34 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v33 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v21 ; GFX9-NEXT: .LBB53_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v32, v47, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v33 -; GFX9-NEXT: v_lshl_or_b32 v33, v46, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v34 +; GFX9-NEXT: v_lshl_or_b32 v34, v47, 16, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v35 +; GFX9-NEXT: v_lshl_or_b32 v35, v46, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v25 -; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 -; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v23 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v19 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -42144,11 +42117,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v15, v45, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v30, v30, 16, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v16, v44, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v43, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_lshl_or_b32 v18, v42, 16, v0 @@ -42164,7 +42137,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX9-NEXT: v_lshl_or_b32 v28, v28, 16, v21 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v22 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v29, v29, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v21, v55, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GFX9-NEXT: v_lshl_or_b32 v22, v54, 16, v0 @@ -42199,10 +42176,10 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; GFX9-NEXT: ; implicit-def: $vgpr29 ; GFX9-NEXT: ; implicit-def: $vgpr30 ; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr38 @@ -42975,46 +42952,54 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB54_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v41 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v60 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v58 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 @@ -43030,157 +43015,145 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -43420,10 +43393,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v27, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v59, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v59 -; VI-NEXT: v_add_f16_sdwa v2, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v58 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v1, v58, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v58 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_add_f16_sdwa v2, v57, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v57 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -44070,61 +44043,64 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v62, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s22 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 @@ -44135,25 +44111,24 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v30, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v60 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 @@ -44178,7 +44153,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s21 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -44188,53 +44163,53 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_mov_b32_e32 v36, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_mov_b32_e32 v61, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_or_b32_e32 v2, v11, v2 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: v_mov_b32_e32 v49, v11 +; SI-NEXT: v_or_b32_e32 v3, v11, v3 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 +; SI-NEXT: v_mov_b32_e32 v29, v44 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_or_b32_e32 v7, v47, v7 -; SI-NEXT: v_mov_b32_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_or_b32_e32 v7, v61, v7 +; SI-NEXT: v_mov_b32_e32 v41, v58 ; SI-NEXT: v_or_b32_e32 v8, v58, v8 -; SI-NEXT: v_mov_b32_e32 v41, v60 -; SI-NEXT: v_or_b32_e32 v9, v59, v9 +; SI-NEXT: v_or_b32_e32 v9, v45, v9 ; SI-NEXT: v_mov_b32_e32 v40, v56 ; SI-NEXT: v_or_b32_e32 v10, v56, v10 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: v_or_b32_e32 v12, v38, v12 -; SI-NEXT: v_or_b32_e32 v13, v36, v13 -; SI-NEXT: v_or_b32_e32 v14, v35, v14 -; SI-NEXT: v_or_b32_e32 v15, v32, v15 -; SI-NEXT: v_or_b32_e32 v17, v37, v17 +; SI-NEXT: v_or_b32_e32 v11, v63, v11 +; SI-NEXT: v_or_b32_e32 v12, v29, v12 +; SI-NEXT: v_or_b32_e32 v13, v38, v13 +; SI-NEXT: v_or_b32_e32 v14, v34, v14 +; SI-NEXT: v_or_b32_e32 v15, v37, v15 +; SI-NEXT: v_or_b32_e32 v17, v35, v17 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -44250,11 +44225,11 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -44278,57 +44253,59 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v43 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v40 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 @@ -44341,121 +44318,122 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v29 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v42 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v46 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v41 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v62 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v10, v46 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v63 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v34 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v16 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload @@ -44474,7 +44452,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -44504,19 +44482,14 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -44579,19 +44552,19 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB55_4: -; SI-NEXT: v_mov_b32_e32 v39, v11 -; SI-NEXT: v_mov_b32_e32 v33, v10 -; SI-NEXT: v_mov_b32_e32 v49, v2 -; SI-NEXT: v_mov_b32_e32 v48, v3 -; SI-NEXT: v_mov_b32_e32 v52, v37 -; SI-NEXT: v_mov_b32_e32 v37, v29 +; SI-NEXT: v_mov_b32_e32 v49, v11 +; SI-NEXT: v_mov_b32_e32 v48, v10 +; SI-NEXT: v_mov_b32_e32 v39, v3 +; SI-NEXT: v_mov_b32_e32 v36, v4 +; SI-NEXT: v_mov_b32_e32 v52, v35 +; SI-NEXT: v_mov_b32_e32 v35, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v41, v60 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_mov_b32_e32 v41, v58 ; SI-NEXT: v_mov_b32_e32 v40, v56 -; SI-NEXT: v_mov_b32_e32 v29, v37 -; SI-NEXT: v_mov_b32_e32 v37, v52 -; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: v_mov_b32_e32 v30, v35 +; SI-NEXT: v_mov_b32_e32 v35, v52 +; SI-NEXT: v_mov_b32_e32 v29, v44 ; SI-NEXT: s_branch .LBB55_2 ; ; VI-LABEL: bitcast_v56f16_to_v14f64_scalar: @@ -44643,61 +44616,61 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 ; VI-NEXT: v_or_b32_sdwa v14, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v25, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -44718,13 +44691,13 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; VI-NEXT: .LBB55_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -44955,8 +44928,8 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v32 -; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: s_movk_i32 s4, 0x200 +; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v59, 16, v14 ; GFX9-NEXT: v_lshl_or_b32 v15, v58, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v57, 16, v16 @@ -45110,61 +45083,77 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v6 :: v_dual_mov_b32 v191, v3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v4 :: v_dual_mov_b32 v185, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v1 :: v_dual_mov_b32 v187, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 ; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v186, 0x200, v186 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v185, 0x200, v185 op_sel_hi:[0,1] @@ -45175,22 +45164,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB55_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -46966,157 +46939,157 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB57_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v25 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v57 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v35 ; SI-NEXT: v_mov_b32_e32 v47, v34 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v61 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s29 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v1, v11 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v4, v13 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v5 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v5, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v7 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v7, v15 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v9 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v13 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v15 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v17 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v18 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v19 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v20 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v22 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v59 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill @@ -47127,11 +47100,29 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v47, v34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: v_mov_b32_e32 v8, v16 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: v_mov_b32_e32 v7, v15 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: v_mov_b32_e32 v5, v14 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: v_mov_b32_e32 v4, v13 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: v_mov_b32_e32 v2, v12 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: v_mov_b32_e32 v1, v11 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 @@ -47149,7 +47140,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 @@ -47203,18 +47194,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: .LBB57_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) @@ -47230,63 +47209,69 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v44, v46 ; SI-NEXT: s_cbranch_vccnz .LBB57_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v2 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s18 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s20 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s22 ; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 ; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s23 ; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 ; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s24 ; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s25 ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s26 ; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v39, s19 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s19 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v57 @@ -47298,106 +47283,87 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v63 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v62 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v33, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v58 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v30 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v54, v27 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v25 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v42, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v44, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -47409,76 +47375,76 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: .LBB57_5: ; %end -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -47491,9 +47457,9 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -47503,8 +47469,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -47514,8 +47480,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -47525,8 +47491,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -47536,7 +47502,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -47547,7 +47513,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -47558,7 +47524,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -47569,8 +47535,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -47580,8 +47546,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -47591,8 +47557,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -47602,8 +47568,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -47613,8 +47579,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -47624,8 +47590,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -47635,8 +47601,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -47646,8 +47612,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -47657,7 +47623,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -47666,7 +47632,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -47724,7 +47690,7 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -49811,582 +49777,565 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v59, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v63, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v47, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v45, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v41, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s28 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v49 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v51 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v53 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v55 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v59, v55 +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v21, v42 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v26, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_mov_b32_e32 v28, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_mov_b32_e32 v51, v23 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 -; SI-NEXT: v_mov_b32_e32 v49, v19 -; SI-NEXT: v_mov_b32_e32 v53, v36 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v36, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_mov_b32_e32 v38, v15 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_mov_b32_e32 v49, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v19 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v23 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 -; SI-NEXT: v_mov_b32_e32 v37, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_or_b32_e32 v53, v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v40 +; SI-NEXT: v_or_b32_e32 v55, v3, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v36 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_mov_b32_e32 v33, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v53 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v32 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v28, v19 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v28, v28, v15 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v39, v5, v29 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v5, v31, v25 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: v_mov_b32_e32 v50, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v34 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v38 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v28, v28, v21 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 -; SI-NEXT: v_or_b32_e32 v38, v31, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v56 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v37, v28, v11 -; SI-NEXT: v_or_b32_e32 v62, v31, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v29, v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v5, v30 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v7, v11 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_or_b32_e32 v35, v28, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v34 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v34, v1, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v31 -; SI-NEXT: v_or_b32_e32 v56, v28, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 -; SI-NEXT: v_or_b32_e32 v2, v2, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 -; SI-NEXT: v_or_b32_e32 v4, v4, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v60 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 -; SI-NEXT: v_or_b32_e32 v6, v6, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v57 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v60 -; SI-NEXT: v_or_b32_e32 v8, v8, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v57 -; SI-NEXT: v_or_b32_e32 v10, v10, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v63 -; SI-NEXT: v_or_b32_e32 v12, v12, v28 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 -; SI-NEXT: v_or_b32_e32 v14, v14, v27 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v60 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v18, v18, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshr_b64 v[52:53], v[17:18], 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v7, v22, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v57 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v33 +; SI-NEXT: v_or_b32_e32 v25, v22, v3 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v23 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_or_b32_e32 v22, v22, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v59 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_lshr_b64 v[54:55], v[21:22], 16 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v51 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v58 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v22 +; SI-NEXT: v_or_b32_e32 v4, v4, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v61 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; SI-NEXT: v_or_b32_e32 v6, v6, v22 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v21 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v28 +; SI-NEXT: v_or_b32_e32 v61, v8, v21 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v63 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 +; SI-NEXT: v_or_b32_e32 v10, v10, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v50 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 +; SI-NEXT: v_or_b32_e32 v12, v12, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v31 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v30 -; SI-NEXT: v_or_b32_e32 v30, v28, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 -; SI-NEXT: v_lshr_b64 v[41:42], v[29:30], 16 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v36 -; SI-NEXT: v_or_b32_e32 v16, v16, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v61 -; SI-NEXT: v_lshr_b64 v[43:44], v[15:16], 16 -; SI-NEXT: v_mov_b32_e32 v44, v34 -; SI-NEXT: v_mov_b32_e32 v42, v33 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v58 -; SI-NEXT: v_or_b32_e32 v20, v20, v27 -; SI-NEXT: v_lshr_b64 v[45:46], v[19:20], 16 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v61 -; SI-NEXT: v_or_b32_e32 v24, v24, v27 -; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[47:48], v[23:24], 16 -; SI-NEXT: v_mov_b32_e32 v23, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 -; SI-NEXT: v_mov_b32_e32 v7, v63 -; SI-NEXT: v_mov_b32_e32 v34, v56 -; SI-NEXT: v_mov_b32_e32 v56, v62 -; SI-NEXT: v_lshr_b64 v[62:63], v[3:4], 16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v19, v39 -; SI-NEXT: v_mov_b32_e32 v15, v38 -; SI-NEXT: v_lshr_b64 v[39:40], v[25:26], 16 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_or_b32_e32 v31, v22, v21 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v58 +; SI-NEXT: v_or_b32_e32 v33, v23, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; SI-NEXT: v_or_b32_e32 v35, v21, v22 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 ; SI-NEXT: v_lshr_b64 v[50:51], v[13:14], 16 +; SI-NEXT: v_or_b32_e32 v37, v23, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v38 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v24, v22 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v62 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; SI-NEXT: v_or_b32_e32 v16, v16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v28 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v38 +; SI-NEXT: v_or_b32_e32 v18, v18, v21 +; SI-NEXT: v_lshr_b64 v[46:47], v[17:18], 16 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v62 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_lshr_b64 v[56:57], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v47, v27 +; SI-NEXT: v_mov_b32_e32 v51, v24 +; SI-NEXT: v_lshr_b64 v[23:24], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[60:61], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 +; SI-NEXT: v_mov_b32_e32 v60, v29 +; SI-NEXT: v_mov_b32_e32 v57, v25 +; SI-NEXT: v_lshr_b64 v[28:29], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v13, v48 ; SI-NEXT: v_lshr_b64 v[48:49], v[11:12], 16 -; SI-NEXT: v_mov_b32_e32 v11, v37 -; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[27:28], v[1:2], 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v17, v55 +; SI-NEXT: v_lshr_b64 v[44:45], v[15:16], 16 +; SI-NEXT: v_mov_b32_e32 v15, v53 +; SI-NEXT: v_lshr_b64 v[42:43], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[30:31], 16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 37f049de7a633..a15c9966df62b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -2619,6 +2619,7 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -2630,7 +2631,6 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2680,6 +2680,7 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -2691,7 +2692,6 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB11_4 @@ -2741,6 +2741,7 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -2752,7 +2753,6 @@ define inreg <30 x i32> @bitcast_v15f64_to_v30i32_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB11_4 @@ -3986,6 +3986,13 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 ; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: s_lshr_b32 s34, s5, 16 ; SI-NEXT: s_lshr_b32 s35, s7, 16 ; SI-NEXT: s_lshr_b32 s36, s9, 16 @@ -4001,57 +4008,54 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s54, s43, 16 ; SI-NEXT: s_lshr_b32 s55, s45, 16 ; SI-NEXT: s_lshr_b32 s64, s47, 16 -; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 -; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 -; SI-NEXT: s_lshr_b64 s[90:91], s[40:41], 16 -; SI-NEXT: s_lshr_b64 s[92:93], s[42:43], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s30, 16 -; SI-NEXT: s_and_b32 s29, s46, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s46, 0xffff +; SI-NEXT: s_lshl_b32 s29, s30, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v1, s27 ; SI-NEXT: s_and_b32 s27, s47, 0xffff ; SI-NEXT: s_lshl_b32 s29, s64, 16 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s94, 16 -; SI-NEXT: s_and_b32 s29, s44, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s45, 0xffff -; SI-NEXT: s_lshl_b32 s29, s55, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s92, 16 -; SI-NEXT: s_and_b32 s29, s42, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v5, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xffff -; SI-NEXT: s_lshl_b32 s29, s54, 16 +; SI-NEXT: s_and_b32 s27, s44, 0xffff +; SI-NEXT: s_lshl_b32 s29, s94, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s45, 0xffff +; SI-NEXT: s_lshl_b32 s29, s55, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v6, s27 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s42, 0xffff +; SI-NEXT: s_lshl_b32 s29, s92, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_lshl_b32 s27, s90, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s29, s54, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s40, 0xffff +; SI-NEXT: s_lshl_b32 s29, s90, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s27 ; SI-NEXT: s_and_b32 s27, s41, 0xffff ; SI-NEXT: s_lshl_b32 s29, s53, 16 -; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -4263,8 +4267,8 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v30i32_to_v60i16_scalar: @@ -5119,8 +5123,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v11 @@ -5364,8 +5368,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v54 -; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v54 ; SI-NEXT: v_or_b32_e32 v3, v3, v58 ; SI-NEXT: v_or_b32_e32 v4, v4, v53 ; SI-NEXT: v_or_b32_e32 v5, v5, v57 @@ -5392,8 +5396,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v27, v27, v32 ; SI-NEXT: v_or_b32_e32 v28, v28, v63 ; SI-NEXT: v_or_b32_e32 v29, v29, v62 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr57 @@ -5525,8 +5529,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_or_b32_e32 v2, v59, v2 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 ; SI-NEXT: v_or_b32_e32 v3, v58, v3 ; SI-NEXT: v_or_b32_e32 v4, v53, v4 ; SI-NEXT: v_or_b32_e32 v5, v57, v5 @@ -5753,10 +5757,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v29, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v61 ; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v60 -; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v1, 3, v60 +; VI-NEXT: v_add_u16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v59 ; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -5951,8 +5955,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 @@ -5988,8 +5992,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 @@ -6031,8 +6035,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr62 @@ -6146,8 +6150,8 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] @@ -6440,43 +6444,41 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: v_mov_b32_e32 v53, v14 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v62, v12 -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v55, v8 -; SI-NEXT: v_mov_b32_e32 v37, v6 -; SI-NEXT: v_mov_b32_e32 v41, v4 -; SI-NEXT: v_mov_b32_e32 v44, v2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v0 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v46, v10 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v35, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v48, v24 -; SI-NEXT: v_mov_b32_e32 v49, v22 -; SI-NEXT: v_mov_b32_e32 v47, v20 -; SI-NEXT: v_mov_b32_e32 v50, v18 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: v_mov_b32_e32 v43, v24 +; SI-NEXT: v_mov_b32_e32 v52, v22 +; SI-NEXT: v_mov_b32_e32 v44, v20 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 @@ -6484,15 +6486,15 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -6504,105 +6506,104 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v7, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v8, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v9, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: v_or_b32_e32 v10, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v17, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v18, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v19, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v16, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v17, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v19, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v45 +; SI-NEXT: v_or_b32_e32 v20, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v43 +; SI-NEXT: v_or_b32_e32 v21, v0, v40 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v26 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v23, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v22, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v23, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v38 +; SI-NEXT: v_or_b32_e32 v24, v0, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v25, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v26, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_or_b32_e32 v27, v0, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v27, v0, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v29, v0, v33 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_or_b32_e32 v28, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v29, v0, v36 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -6639,125 +6640,126 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB15_3: ; %end @@ -6780,67 +6782,64 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v45, v43 -; SI-NEXT: v_mov_b32_e32 v44, v42 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v42, v40 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v41, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v50 +; SI-NEXT: v_mov_b32_e32 v51, v49 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v39, v37 ; SI-NEXT: v_mov_b32_e32 v38, v36 ; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v36, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: v_mov_b32_e32 v35, v33 ; SI-NEXT: v_mov_b32_e32 v34, v32 ; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v60 ; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v53 -; SI-NEXT: v_mov_b32_e32 v53, v61 -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v59 -; SI-NEXT: v_mov_b32_e32 v59, v51 -; SI-NEXT: v_mov_b32_e32 v51, v57 -; SI-NEXT: v_mov_b32_e32 v57, v50 -; SI-NEXT: v_mov_b32_e32 v50, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v61, v59 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v59, v57 +; SI-NEXT: v_mov_b32_e32 v57, v47 +; SI-NEXT: v_mov_b32_e32 v47, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_mov_b32_e32 v45, v52 +; SI-NEXT: v_mov_b32_e32 v52, v43 +; SI-NEXT: v_mov_b32_e32 v43, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v50 -; SI-NEXT: v_mov_b32_e32 v50, v57 -; SI-NEXT: v_mov_b32_e32 v57, v51 -; SI-NEXT: v_mov_b32_e32 v51, v59 -; SI-NEXT: v_mov_b32_e32 v59, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 ; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v30, v43 +; SI-NEXT: v_mov_b32_e32 v43, v52 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v47 +; SI-NEXT: v_mov_b32_e32 v47, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 +; SI-NEXT: v_mov_b32_e32 v59, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 ; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v35 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v34, v36 ; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v37, v40 -; SI-NEXT: v_mov_b32_e32 v38, v41 -; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_mov_b32_e32 v41, v43 -; SI-NEXT: v_mov_b32_e32 v42, v44 -; SI-NEXT: v_mov_b32_e32 v43, v45 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_mov_b32_e32 v50, v40 +; SI-NEXT: v_mov_b32_e32 v51, v41 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v60i16_to_v30i32_scalar: @@ -6888,71 +6887,71 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -6971,125 +6970,125 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB15_3 ; VI-NEXT: .LBB15_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff ; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s17, 0xffff ; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff ; VI-NEXT: s_lshl_b32 s17, s41, 16 ; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v38 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s16, s17, s16 ; VI-NEXT: s_and_b32 s17, s19, 0xffff ; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s17, s18, s17 ; VI-NEXT: s_and_b32 s18, s20, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s15, s15, s18 ; VI-NEXT: s_and_b32 s18, s21, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 @@ -7104,7 +7103,7 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -7211,6 +7210,8 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 @@ -7236,10 +7237,8 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -7257,8 +7256,8 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v13, s19 ; GFX9-NEXT: s_cbranch_execnz .LBB15_3 ; GFX9-NEXT: .LBB15_2: ; %cmp.true -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v54 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 @@ -7273,8 +7272,8 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v63, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 @@ -7289,8 +7288,6 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 ; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 ; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -7305,6 +7302,8 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -7435,61 +7434,77 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] @@ -7502,22 +7517,6 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB15_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -7840,6 +7839,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -7862,33 +7863,33 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -7910,70 +7911,69 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 @@ -7982,6 +7982,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -8001,7 +8002,6 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 @@ -8011,32 +8011,32 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 @@ -8075,34 +8075,40 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB16_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_mov_b32_e32 v63, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 @@ -8117,36 +8123,32 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -8173,65 +8175,62 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_mov_b32_e32 v61, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_mov_b32_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 ; SI-NEXT: v_mov_b32_e32 v37, v27 ; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v34, v29 +; SI-NEXT: v_mov_b32_e32 v33, v29 ; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: v_mov_b32_e32 v63, v25 -; SI-NEXT: v_mov_b32_e32 v59, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v61, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) @@ -8254,25 +8253,23 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 @@ -8281,7 +8278,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -8290,7 +8287,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -8299,7 +8296,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -8308,7 +8305,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -8317,7 +8314,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -8326,16 +8323,16 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -8344,7 +8341,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -8353,7 +8350,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -8362,7 +8359,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 @@ -8372,8 +8369,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8383,8 +8380,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8394,8 +8391,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8405,8 +8402,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8416,8 +8413,8 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8427,7 +8424,7 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -8437,61 +8434,63 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -10965,23 +10964,29 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 @@ -11006,20 +11011,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -11027,175 +11021,178 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload @@ -11445,10 +11442,10 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v29, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v1, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v60 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -12140,48 +12137,43 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 @@ -12191,10 +12183,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 @@ -12204,37 +12195,41 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v54 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 @@ -12246,7 +12241,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 @@ -12273,46 +12268,48 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_or_b32_e32 v12, v45, v12 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 ; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -12321,328 +12318,313 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 ; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v62, v36 +; SI-NEXT: v_mov_b32_e32 v60, v35 ; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_mov_b32_e32 v47, v63 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: v_or_b32_e32 v14, v55, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 ; SI-NEXT: v_or_b32_e32 v20, v53, v20 ; SI-NEXT: v_or_b32_e32 v21, v51, v21 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v22, v31, v22 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v32, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v17, v32, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: v_or_b32_e32 v16, v45, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v39, v9 -; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v36, v39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_or_b32_e32 v9, v39, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v54, v29 ; SI-NEXT: v_mov_b32_e32 v54, v32 ; SI-NEXT: s_branch .LBB19_3 ; SI-NEXT: .LBB19_2: +; SI-NEXT: v_mov_b32_e32 v48, v1 ; SI-NEXT: v_mov_b32_e32 v54, v53 ; SI-NEXT: v_mov_b32_e32 v53, v52 ; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: v_mov_b32_e32 v51, v30 +; SI-NEXT: v_mov_b32_e32 v50, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v30, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v62, v36 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v30, v51 ; SI-NEXT: v_mov_b32_e32 v51, v52 ; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v57, v39 ; SI-NEXT: v_mov_b32_e32 v56, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: v_mov_b32_e32 v47, v63 +; SI-NEXT: v_mov_b32_e32 v44, v43 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v31, v50 ; SI-NEXT: .LBB19_3: ; %Flow ; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 ; SI-NEXT: v_mov_b32_e32 v55, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 @@ -12652,30 +12634,38 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -12787,71 +12777,71 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -12872,13 +12862,13 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; VI-NEXT: .LBB19_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -13067,6 +13057,8 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 @@ -13092,10 +13084,8 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -13129,9 +13119,9 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 @@ -13293,61 +13283,77 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB19_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB19_3 ; GFX11-TRUE16-NEXT: .LBB19_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] @@ -13360,22 +13366,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB19_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -15253,6 +15243,7 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -15264,7 +15255,6 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 @@ -15314,6 +15304,7 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -15325,7 +15316,6 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB27_4 @@ -15375,6 +15365,7 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -15386,7 +15377,6 @@ define inreg <30 x float> @bitcast_v15f64_to_v30f32_scalar(<15 x double> inreg % ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 @@ -16455,6 +16445,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v27, s16 ; SI-NEXT: v_mov_b32_e32 v28, s17 ; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v30, s19 ; SI-NEXT: v_mov_b32_e32 v25, s20 ; SI-NEXT: v_mov_b32_e32 v26, s21 @@ -16464,38 +16455,37 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v22, s25 ; SI-NEXT: v_mov_b32_e32 v19, s26 ; SI-NEXT: v_mov_b32_e32 v20, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_mov_b32_e32 v18, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshr_b64 v[31:32], v[15:16], 16 ; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[33:34], v[11:12], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 ; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 ; SI-NEXT: v_lshr_b64 v[36:37], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 ; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 ; SI-NEXT: v_lshr_b64 v[53:54], v[29:30], 16 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 @@ -16504,23 +16494,22 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v30 ; SI-NEXT: v_lshr_b64 v[54:55], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 @@ -16575,36 +16564,36 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v54 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v39 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_or_b32_e32 v27, v27, v54 ; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -16613,9 +16602,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v52 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v25, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen @@ -16625,9 +16614,9 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v51 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen @@ -16769,26 +16758,25 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -16801,22 +16789,22 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v30f32_to_v60i16_scalar: @@ -17757,8 +17745,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v11 @@ -18002,8 +17990,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v54 -; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v54 ; SI-NEXT: v_or_b32_e32 v3, v3, v58 ; SI-NEXT: v_or_b32_e32 v4, v4, v53 ; SI-NEXT: v_or_b32_e32 v5, v5, v57 @@ -18030,8 +18018,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v27, v27, v32 ; SI-NEXT: v_or_b32_e32 v28, v28, v63 ; SI-NEXT: v_or_b32_e32 v29, v29, v62 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr57 @@ -18163,8 +18151,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_or_b32_e32 v2, v59, v2 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 ; SI-NEXT: v_or_b32_e32 v3, v58, v3 ; SI-NEXT: v_or_b32_e32 v4, v53, v4 ; SI-NEXT: v_or_b32_e32 v5, v57, v5 @@ -18391,10 +18379,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v29, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v61 ; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v60 -; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v1, 3, v60 +; VI-NEXT: v_add_u16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v59 ; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -18589,8 +18577,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 @@ -18626,8 +18614,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 @@ -18669,8 +18657,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr62 @@ -18784,8 +18772,8 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] @@ -19078,43 +19066,41 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: v_mov_b32_e32 v53, v14 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v62, v12 -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v55, v8 -; SI-NEXT: v_mov_b32_e32 v37, v6 -; SI-NEXT: v_mov_b32_e32 v41, v4 -; SI-NEXT: v_mov_b32_e32 v44, v2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v0 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v46, v10 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v35, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v48, v24 -; SI-NEXT: v_mov_b32_e32 v49, v22 -; SI-NEXT: v_mov_b32_e32 v47, v20 -; SI-NEXT: v_mov_b32_e32 v50, v18 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: v_mov_b32_e32 v43, v24 +; SI-NEXT: v_mov_b32_e32 v52, v22 +; SI-NEXT: v_mov_b32_e32 v44, v20 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 @@ -19122,15 +19108,15 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -19142,105 +19128,104 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v7, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v8, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v9, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: v_or_b32_e32 v10, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v17, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v18, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v19, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v16, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v17, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v19, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v45 +; SI-NEXT: v_or_b32_e32 v20, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v43 +; SI-NEXT: v_or_b32_e32 v21, v0, v40 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v26 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v23, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v22, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v23, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v38 +; SI-NEXT: v_or_b32_e32 v24, v0, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v25, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v26, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_or_b32_e32 v27, v0, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v27, v0, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v29, v0, v33 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_or_b32_e32 v28, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v29, v0, v36 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -19277,125 +19262,126 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB31_3: ; %end @@ -19418,67 +19404,64 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v45, v43 -; SI-NEXT: v_mov_b32_e32 v44, v42 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v42, v40 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v41, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v50 +; SI-NEXT: v_mov_b32_e32 v51, v49 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v39, v37 ; SI-NEXT: v_mov_b32_e32 v38, v36 ; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v36, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: v_mov_b32_e32 v35, v33 ; SI-NEXT: v_mov_b32_e32 v34, v32 ; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v60 ; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v53 -; SI-NEXT: v_mov_b32_e32 v53, v61 -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v59 -; SI-NEXT: v_mov_b32_e32 v59, v51 -; SI-NEXT: v_mov_b32_e32 v51, v57 -; SI-NEXT: v_mov_b32_e32 v57, v50 -; SI-NEXT: v_mov_b32_e32 v50, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v61, v59 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v59, v57 +; SI-NEXT: v_mov_b32_e32 v57, v47 +; SI-NEXT: v_mov_b32_e32 v47, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_mov_b32_e32 v45, v52 +; SI-NEXT: v_mov_b32_e32 v52, v43 +; SI-NEXT: v_mov_b32_e32 v43, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v50 -; SI-NEXT: v_mov_b32_e32 v50, v57 -; SI-NEXT: v_mov_b32_e32 v57, v51 -; SI-NEXT: v_mov_b32_e32 v51, v59 -; SI-NEXT: v_mov_b32_e32 v59, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 ; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v30, v43 +; SI-NEXT: v_mov_b32_e32 v43, v52 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v47 +; SI-NEXT: v_mov_b32_e32 v47, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 +; SI-NEXT: v_mov_b32_e32 v59, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 ; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v35 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v34, v36 ; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v37, v40 -; SI-NEXT: v_mov_b32_e32 v38, v41 -; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_mov_b32_e32 v41, v43 -; SI-NEXT: v_mov_b32_e32 v42, v44 -; SI-NEXT: v_mov_b32_e32 v43, v45 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_mov_b32_e32 v50, v40 +; SI-NEXT: v_mov_b32_e32 v51, v41 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v60i16_to_v30f32_scalar: @@ -19526,71 +19509,71 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -19609,125 +19592,125 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB31_3 ; VI-NEXT: .LBB31_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff ; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s17, 0xffff ; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff ; VI-NEXT: s_lshl_b32 s17, s41, 16 ; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v38 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s16, s17, s16 ; VI-NEXT: s_and_b32 s17, s19, 0xffff ; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s17, s18, s17 ; VI-NEXT: s_and_b32 s18, s20, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s15, s15, s18 ; VI-NEXT: s_and_b32 s18, s21, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 @@ -19742,7 +19725,7 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -19849,6 +19832,8 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 @@ -19874,10 +19859,8 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -19895,8 +19878,8 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v13, s19 ; GFX9-NEXT: s_cbranch_execnz .LBB31_3 ; GFX9-NEXT: .LBB31_2: ; %cmp.true -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v54 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 @@ -19911,8 +19894,8 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v63, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 @@ -19927,8 +19910,6 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 ; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 ; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -19943,6 +19924,8 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -20073,61 +20056,77 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB31_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB31_3 ; GFX11-TRUE16-NEXT: .LBB31_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] @@ -20140,22 +20139,6 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB31_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -20478,6 +20461,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -20500,33 +20485,33 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -20548,70 +20533,69 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB32_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 @@ -20620,6 +20604,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -20639,7 +20624,6 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 @@ -20649,32 +20633,32 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 @@ -20713,34 +20697,40 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB32_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_mov_b32_e32 v63, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 @@ -20755,36 +20745,32 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -20811,65 +20797,62 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_mov_b32_e32 v61, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_mov_b32_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 ; SI-NEXT: v_mov_b32_e32 v37, v27 ; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v34, v29 +; SI-NEXT: v_mov_b32_e32 v33, v29 ; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: v_mov_b32_e32 v63, v25 -; SI-NEXT: v_mov_b32_e32 v59, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v61, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: .LBB32_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) @@ -20892,25 +20875,23 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 @@ -20919,7 +20900,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -20928,7 +20909,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -20937,7 +20918,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -20946,7 +20927,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -20955,7 +20936,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -20964,16 +20945,16 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -20982,7 +20963,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -20991,7 +20972,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -21000,7 +20981,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 @@ -21010,8 +20991,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -21021,8 +21002,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -21032,8 +21013,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -21043,8 +21024,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -21054,8 +21035,8 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -21065,7 +21046,7 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -21075,61 +21056,63 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -21803,74 +21786,128 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB33_4 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v14 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v35 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v11 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v51 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v35 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v37 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v7 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v49 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v50 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v2 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v29 ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v34 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v17 @@ -21879,141 +21916,87 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v38 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v18, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v15 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v13 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v12 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v11 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v10 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v5 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v4 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v2 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v32 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v31 ; SI-NEXT: s_cbranch_execnz .LBB33_3 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v9 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v9 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v8 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v47 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v45 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v3 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v31 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v50 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v51 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v49 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v39 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v38 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v37 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v36 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v35 ; SI-NEXT: v_add_f32_e32 v34, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 @@ -22021,27 +22004,29 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v32 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 @@ -22056,47 +22041,46 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -22107,11 +22091,10 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -22138,7 +22121,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v24 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -22167,7 +22150,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -22195,7 +22178,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -22278,7 +22261,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -22321,7 +22304,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -22337,7 +22320,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 @@ -22399,7 +22382,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: ; kill: killed $vgpr53 ; SI-NEXT: ; kill: killed $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr44 @@ -22414,14 +22397,14 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr45 @@ -22429,10 +22412,10 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; kill: killed $vgpr52 -; SI-NEXT: ; kill: killed $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; kill: killed $vgpr52 +; SI-NEXT: ; kill: killed $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; kill: killed $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 @@ -23802,23 +23785,29 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 @@ -23843,20 +23832,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -23864,175 +23842,178 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload @@ -24282,10 +24263,10 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v29, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v1, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v60 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -24977,48 +24958,43 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 @@ -25028,10 +25004,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 @@ -25041,37 +25016,41 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v54 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 @@ -25083,7 +25062,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 @@ -25110,46 +25089,48 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_or_b32_e32 v12, v45, v12 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 ; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -25158,328 +25139,313 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 ; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v62, v36 +; SI-NEXT: v_mov_b32_e32 v60, v35 ; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_mov_b32_e32 v47, v63 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: v_or_b32_e32 v14, v55, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 ; SI-NEXT: v_or_b32_e32 v20, v53, v20 ; SI-NEXT: v_or_b32_e32 v21, v51, v21 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v22, v31, v22 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v32, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v17, v32, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: v_or_b32_e32 v16, v45, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v39, v9 -; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v36, v39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_or_b32_e32 v9, v39, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v54, v29 ; SI-NEXT: v_mov_b32_e32 v54, v32 ; SI-NEXT: s_branch .LBB35_3 ; SI-NEXT: .LBB35_2: +; SI-NEXT: v_mov_b32_e32 v48, v1 ; SI-NEXT: v_mov_b32_e32 v54, v53 ; SI-NEXT: v_mov_b32_e32 v53, v52 ; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: v_mov_b32_e32 v51, v30 +; SI-NEXT: v_mov_b32_e32 v50, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v30, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v62, v36 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v30, v51 ; SI-NEXT: v_mov_b32_e32 v51, v52 ; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v57, v39 ; SI-NEXT: v_mov_b32_e32 v56, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: v_mov_b32_e32 v47, v63 +; SI-NEXT: v_mov_b32_e32 v44, v43 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v31, v50 ; SI-NEXT: .LBB35_3: ; %Flow ; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB35_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 ; SI-NEXT: v_mov_b32_e32 v55, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 @@ -25489,30 +25455,38 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -25624,71 +25598,71 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -25709,13 +25683,13 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; VI-NEXT: .LBB35_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -25904,6 +25878,8 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 @@ -25929,10 +25905,8 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -25966,9 +25940,9 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 @@ -26130,61 +26104,77 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB35_3 ; GFX11-TRUE16-NEXT: .LBB35_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] @@ -26197,22 +26187,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB35_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -27160,6 +27134,7 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v15, v1 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_mov_b32_e32 v3, s19 @@ -27171,7 +27146,6 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: v_mov_b32_e32 v10, s26 ; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v12, s28 ; SI-NEXT: v_mov_b32_e32 v13, s29 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 @@ -27221,6 +27195,7 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v15, v1 ; VI-NEXT: v_mov_b32_e32 v14, v0 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v1, s17 ; VI-NEXT: v_mov_b32_e32 v2, s18 ; VI-NEXT: v_mov_b32_e32 v3, s19 @@ -27232,7 +27207,6 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v9, s25 ; VI-NEXT: v_mov_b32_e32 v10, s26 ; VI-NEXT: v_mov_b32_e32 v11, s27 -; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_mov_b32_e32 v12, s28 ; VI-NEXT: v_mov_b32_e32 v13, s29 ; VI-NEXT: s_cbranch_scc0 .LBB39_4 @@ -27282,6 +27256,7 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v15, v1 ; GFX9-NEXT: v_mov_b32_e32 v14, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 @@ -27293,7 +27268,6 @@ define inreg <15 x i64> @bitcast_v15f64_to_v15i64_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v9, s25 ; GFX9-NEXT: v_mov_b32_e32 v10, s26 ; GFX9-NEXT: v_mov_b32_e32 v11, s27 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: v_mov_b32_e32 v12, s28 ; GFX9-NEXT: v_mov_b32_e32 v13, s29 ; GFX9-NEXT: s_cbranch_scc0 .LBB39_4 @@ -28566,49 +28540,53 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 ; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_lshl_b32 s27, s30, 16 -; SI-NEXT: s_and_b32 s29, s46, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 +; SI-NEXT: s_and_b32 s27, s46, 0xffff +; SI-NEXT: s_lshl_b32 s29, s30, 16 +; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v1, s27 ; SI-NEXT: s_and_b32 s27, s47, 0xffff ; SI-NEXT: s_lshl_b32 s29, s64, 16 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: v_mov_b32_e32 v2, s27 -; SI-NEXT: s_lshl_b32 s27, s94, 16 -; SI-NEXT: s_and_b32 s29, s44, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v3, s27 -; SI-NEXT: s_and_b32 s27, s45, 0xffff -; SI-NEXT: s_lshl_b32 s29, s55, 16 -; SI-NEXT: s_or_b32 s27, s27, s29 -; SI-NEXT: v_mov_b32_e32 v4, s27 -; SI-NEXT: s_lshl_b32 s27, s92, 16 -; SI-NEXT: s_and_b32 s29, s42, 0xffff -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: v_mov_b32_e32 v5, s27 -; SI-NEXT: s_and_b32 s27, s43, 0xffff -; SI-NEXT: s_lshl_b32 s29, s54, 16 +; SI-NEXT: s_and_b32 s27, s44, 0xffff +; SI-NEXT: s_lshl_b32 s29, s94, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s45, 0xffff +; SI-NEXT: s_lshl_b32 s29, s55, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v6, s27 -; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s42, 0xffff +; SI-NEXT: s_lshl_b32 s29, s92, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_lshl_b32 s27, s90, 16 -; SI-NEXT: s_and_b32 s29, s40, 0xffff -; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s43, 0xffff +; SI-NEXT: s_lshl_b32 s29, s54, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_or_b32 s27, s29, s27 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s27 +; SI-NEXT: s_and_b32 s27, s40, 0xffff +; SI-NEXT: s_lshl_b32 s29, s90, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: s_or_b32 s27, s27, s29 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s27 ; SI-NEXT: s_and_b32 s27, s41, 0xffff ; SI-NEXT: s_lshl_b32 s29, s53, 16 -; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 ; SI-NEXT: s_or_b32 s27, s27, s29 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -28820,8 +28798,8 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v15i64_to_v60i16_scalar: @@ -29676,8 +29654,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v11 @@ -29921,8 +29899,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v54 -; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v54 ; SI-NEXT: v_or_b32_e32 v3, v3, v58 ; SI-NEXT: v_or_b32_e32 v4, v4, v53 ; SI-NEXT: v_or_b32_e32 v5, v5, v57 @@ -29949,8 +29927,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v27, v27, v32 ; SI-NEXT: v_or_b32_e32 v28, v28, v63 ; SI-NEXT: v_or_b32_e32 v29, v29, v62 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr57 @@ -30082,8 +30060,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_or_b32_e32 v2, v59, v2 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 ; SI-NEXT: v_or_b32_e32 v3, v58, v3 ; SI-NEXT: v_or_b32_e32 v4, v53, v4 ; SI-NEXT: v_or_b32_e32 v5, v57, v5 @@ -30310,10 +30288,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v29, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v61 ; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v60 -; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v1, 3, v60 +; VI-NEXT: v_add_u16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v59 ; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -30508,8 +30486,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 @@ -30545,8 +30523,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 @@ -30588,8 +30566,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr62 @@ -30703,8 +30681,8 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] @@ -30997,43 +30975,41 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: v_mov_b32_e32 v53, v14 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v62, v12 -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v55, v8 -; SI-NEXT: v_mov_b32_e32 v37, v6 -; SI-NEXT: v_mov_b32_e32 v41, v4 -; SI-NEXT: v_mov_b32_e32 v44, v2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v0 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v46, v10 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v35, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v48, v24 -; SI-NEXT: v_mov_b32_e32 v49, v22 -; SI-NEXT: v_mov_b32_e32 v47, v20 -; SI-NEXT: v_mov_b32_e32 v50, v18 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: v_mov_b32_e32 v43, v24 +; SI-NEXT: v_mov_b32_e32 v52, v22 +; SI-NEXT: v_mov_b32_e32 v44, v20 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 @@ -31041,15 +31017,15 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -31061,105 +31037,104 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v7, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v8, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v9, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: v_or_b32_e32 v10, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v17, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v18, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v19, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v16, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v17, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v19, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v45 +; SI-NEXT: v_or_b32_e32 v20, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v43 +; SI-NEXT: v_or_b32_e32 v21, v0, v40 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v26 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v23, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v22, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v23, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v38 +; SI-NEXT: v_or_b32_e32 v24, v0, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v25, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v26, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_or_b32_e32 v27, v0, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v27, v0, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v29, v0, v33 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_or_b32_e32 v28, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v29, v0, v36 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -31196,125 +31171,126 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB43_3: ; %end @@ -31337,67 +31313,64 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v45, v43 -; SI-NEXT: v_mov_b32_e32 v44, v42 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v42, v40 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v41, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v50 +; SI-NEXT: v_mov_b32_e32 v51, v49 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v39, v37 ; SI-NEXT: v_mov_b32_e32 v38, v36 ; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v36, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: v_mov_b32_e32 v35, v33 ; SI-NEXT: v_mov_b32_e32 v34, v32 ; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v60 ; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v53 -; SI-NEXT: v_mov_b32_e32 v53, v61 -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v59 -; SI-NEXT: v_mov_b32_e32 v59, v51 -; SI-NEXT: v_mov_b32_e32 v51, v57 -; SI-NEXT: v_mov_b32_e32 v57, v50 -; SI-NEXT: v_mov_b32_e32 v50, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v61, v59 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v59, v57 +; SI-NEXT: v_mov_b32_e32 v57, v47 +; SI-NEXT: v_mov_b32_e32 v47, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_mov_b32_e32 v45, v52 +; SI-NEXT: v_mov_b32_e32 v52, v43 +; SI-NEXT: v_mov_b32_e32 v43, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v50 -; SI-NEXT: v_mov_b32_e32 v50, v57 -; SI-NEXT: v_mov_b32_e32 v57, v51 -; SI-NEXT: v_mov_b32_e32 v51, v59 -; SI-NEXT: v_mov_b32_e32 v59, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 ; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v30, v43 +; SI-NEXT: v_mov_b32_e32 v43, v52 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v47 +; SI-NEXT: v_mov_b32_e32 v47, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 +; SI-NEXT: v_mov_b32_e32 v59, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 ; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v35 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v34, v36 ; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v37, v40 -; SI-NEXT: v_mov_b32_e32 v38, v41 -; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_mov_b32_e32 v41, v43 -; SI-NEXT: v_mov_b32_e32 v42, v44 -; SI-NEXT: v_mov_b32_e32 v43, v45 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_mov_b32_e32 v50, v40 +; SI-NEXT: v_mov_b32_e32 v51, v41 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v60i16_to_v15i64_scalar: @@ -31445,71 +31418,71 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -31528,125 +31501,125 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff ; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s17, 0xffff ; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff ; VI-NEXT: s_lshl_b32 s17, s41, 16 ; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v38 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s16, s17, s16 ; VI-NEXT: s_and_b32 s17, s19, 0xffff ; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s17, s18, s17 ; VI-NEXT: s_and_b32 s18, s20, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s15, s15, s18 ; VI-NEXT: s_and_b32 s18, s21, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 @@ -31661,7 +31634,7 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -31768,6 +31741,8 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 @@ -31793,10 +31768,8 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -31814,8 +31787,8 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v13, s19 ; GFX9-NEXT: s_cbranch_execnz .LBB43_3 ; GFX9-NEXT: .LBB43_2: ; %cmp.true -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v54 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 @@ -31830,8 +31803,8 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v63, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 @@ -31846,8 +31819,6 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 ; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 ; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -31862,6 +31833,8 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -31992,61 +31965,77 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB43_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB43_3 ; GFX11-TRUE16-NEXT: .LBB43_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] @@ -32059,22 +32048,6 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB43_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -32397,6 +32370,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -32419,33 +32394,33 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -32467,70 +32442,69 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 @@ -32539,6 +32513,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -32558,7 +32533,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 @@ -32568,32 +32542,32 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v25 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v61, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 @@ -32634,8 +32608,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 @@ -32654,57 +32628,59 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 ; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v32 +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 ; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_mov_b32_e32 v63, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -32731,65 +32707,62 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v61 -; SI-NEXT: v_mov_b32_e32 v61, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 +; SI-NEXT: v_mov_b32_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 ; SI-NEXT: v_mov_b32_e32 v37, v27 ; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v34, v29 +; SI-NEXT: v_mov_b32_e32 v33, v29 ; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: v_mov_b32_e32 v63, v25 -; SI-NEXT: v_mov_b32_e32 v59, v26 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v61, v26 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) @@ -32812,25 +32785,23 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 @@ -32839,7 +32810,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -32848,7 +32819,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -32857,7 +32828,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -32866,7 +32837,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -32875,7 +32846,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -32884,16 +32855,16 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -32902,7 +32873,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -32911,7 +32882,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -32920,7 +32891,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 @@ -32930,8 +32901,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -32941,8 +32912,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -32952,8 +32923,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -32963,8 +32934,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -32974,8 +32945,8 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -32985,7 +32956,7 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -32995,61 +32966,63 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -35539,23 +35512,29 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 @@ -35580,20 +35559,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -35601,175 +35569,178 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload @@ -36019,10 +35990,10 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v29, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v1, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v60 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -36714,48 +36685,43 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 @@ -36765,10 +36731,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 @@ -36778,37 +36743,41 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v54 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 @@ -36820,7 +36789,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 @@ -36847,46 +36816,48 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_or_b32_e32 v12, v45, v12 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 ; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -36895,328 +36866,313 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 ; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v62, v36 +; SI-NEXT: v_mov_b32_e32 v60, v35 ; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_mov_b32_e32 v47, v63 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: v_or_b32_e32 v14, v55, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 ; SI-NEXT: v_or_b32_e32 v20, v53, v20 ; SI-NEXT: v_or_b32_e32 v21, v51, v21 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v22, v31, v22 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v32, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v17, v32, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: v_or_b32_e32 v16, v45, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v39, v9 -; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v36, v39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_or_b32_e32 v9, v39, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v54, v29 ; SI-NEXT: v_mov_b32_e32 v54, v32 ; SI-NEXT: s_branch .LBB47_3 ; SI-NEXT: .LBB47_2: +; SI-NEXT: v_mov_b32_e32 v48, v1 ; SI-NEXT: v_mov_b32_e32 v54, v53 ; SI-NEXT: v_mov_b32_e32 v53, v52 ; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: v_mov_b32_e32 v51, v30 +; SI-NEXT: v_mov_b32_e32 v50, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v30, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v62, v36 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v30, v51 ; SI-NEXT: v_mov_b32_e32 v51, v52 ; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v57, v39 ; SI-NEXT: v_mov_b32_e32 v56, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: v_mov_b32_e32 v47, v63 +; SI-NEXT: v_mov_b32_e32 v44, v43 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v31, v50 ; SI-NEXT: .LBB47_3: ; %Flow ; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB47_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 ; SI-NEXT: v_mov_b32_e32 v55, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 @@ -37226,30 +37182,38 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -37361,71 +37325,71 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -37446,13 +37410,13 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; VI-NEXT: .LBB47_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -37641,6 +37605,8 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 @@ -37666,10 +37632,8 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -37703,9 +37667,9 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 @@ -37867,61 +37831,77 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB47_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB47_3 ; GFX11-TRUE16-NEXT: .LBB47_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] @@ -37934,22 +37914,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB47_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -38381,9 +38345,9 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 ; SI-NEXT: .LBB48_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v56 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -39165,6 +39129,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v27, s16 ; SI-NEXT: v_mov_b32_e32 v28, s17 ; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v30, s19 ; SI-NEXT: v_mov_b32_e32 v25, s20 ; SI-NEXT: v_mov_b32_e32 v26, s21 @@ -39174,210 +39139,207 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v22, s25 ; SI-NEXT: v_mov_b32_e32 v19, s26 ; SI-NEXT: v_mov_b32_e32 v20, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_mov_b32_e32 v18, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshr_b64 v[31:32], v[15:16], 16 ; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[33:34], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 ; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 ; SI-NEXT: v_lshr_b64 v[36:37], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 ; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[53:54], v[25:26], 16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v28 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[54:55], v[29:30], 16 -; SI-NEXT: v_lshr_b64 v[39:40], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v30 +; SI-NEXT: v_lshr_b64 v[54:55], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; SI-NEXT: v_lshr_b64 v[31:32], v[15:16], 16 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_lshr_b64 v[33:34], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshr_b64 v[36:37], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[51:52], v[1:2], 16 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshr_b64 v[36:37], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 ; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[52:53], v[23:24], 16 -; SI-NEXT: v_lshr_b64 v[38:39], v[17:18], 16 -; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 -; SI-NEXT: v_lshr_b64 v[53:54], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[54:55], v[29:30], 16 -; SI-NEXT: v_lshr_b64 v[39:40], v[27:28], 16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; SI-NEXT: v_lshr_b64 v[53:54], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v39 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_or_b32_e32 v27, v27, v54 ; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v54 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v53 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v61 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v52 ; SI-NEXT: v_or_b32_e32 v25, v25, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v60 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v52 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v51 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v58 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v49 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v56 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -39389,7 +39351,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -39401,7 +39363,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -39413,7 +39375,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -39425,7 +39387,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -39437,7 +39399,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -39449,7 +39411,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -39461,80 +39423,78 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v15f64_to_v60i16_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v29, s18 -; VI-NEXT: v_mov_b32_e32 v30, s19 -; VI-NEXT: v_mov_b32_e32 v27, s20 -; VI-NEXT: v_mov_b32_e32 v28, s21 -; VI-NEXT: v_mov_b32_e32 v25, s22 -; VI-NEXT: v_mov_b32_e32 v26, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v21, s26 -; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_mov_b32_e32 v19, s16 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v19, s28 -; VI-NEXT: v_mov_b32_e32 v20, s29 +; VI-NEXT: v_mov_b32_e32 v29, s20 +; VI-NEXT: v_mov_b32_e32 v30, s21 +; VI-NEXT: v_mov_b32_e32 v27, s22 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v25, s24 +; VI-NEXT: v_mov_b32_e32 v26, s25 +; VI-NEXT: v_mov_b32_e32 v23, s26 +; VI-NEXT: v_mov_b32_e32 v24, s27 +; VI-NEXT: v_mov_b32_e32 v21, s28 +; VI-NEXT: v_mov_b32_e32 v22, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -39565,20 +39525,20 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; VI-NEXT: s_cbranch_execnz .LBB49_3 ; VI-NEXT: .LBB49_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -39589,13 +39549,13 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; VI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; VI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 @@ -39612,49 +39572,49 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; VI-NEXT: .LBB49_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 -; VI-NEXT: v_or_b32_sdwa v38, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; VI-NEXT: v_or_b32_sdwa v39, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 -; VI-NEXT: v_or_b32_sdwa v48, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 -; VI-NEXT: v_or_b32_sdwa v49, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 -; VI-NEXT: v_or_b32_sdwa v50, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v50, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 -; VI-NEXT: v_or_b32_sdwa v51, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v51, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 -; VI-NEXT: v_or_b32_sdwa v52, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v52, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 -; VI-NEXT: v_or_b32_sdwa v53, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v53, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 ; VI-NEXT: v_or_b32_sdwa v30, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 @@ -39754,21 +39714,21 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_mov_b32_e32 v17, s16 -; GFX9-NEXT: v_mov_b32_e32 v18, s17 -; GFX9-NEXT: v_mov_b32_e32 v29, s18 -; GFX9-NEXT: v_mov_b32_e32 v30, s19 -; GFX9-NEXT: v_mov_b32_e32 v27, s20 -; GFX9-NEXT: v_mov_b32_e32 v28, s21 -; GFX9-NEXT: v_mov_b32_e32 v25, s22 -; GFX9-NEXT: v_mov_b32_e32 v26, s23 -; GFX9-NEXT: v_mov_b32_e32 v23, s24 -; GFX9-NEXT: v_mov_b32_e32 v24, s25 -; GFX9-NEXT: v_mov_b32_e32 v21, s26 -; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v19, s28 -; GFX9-NEXT: v_mov_b32_e32 v20, s29 +; GFX9-NEXT: v_mov_b32_e32 v29, s20 +; GFX9-NEXT: v_mov_b32_e32 v30, s21 +; GFX9-NEXT: v_mov_b32_e32 v27, s22 +; GFX9-NEXT: v_mov_b32_e32 v28, s23 +; GFX9-NEXT: v_mov_b32_e32 v25, s24 +; GFX9-NEXT: v_mov_b32_e32 v26, s25 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 +; GFX9-NEXT: v_mov_b32_e32 v24, s27 +; GFX9-NEXT: v_mov_b32_e32 v21, s28 +; GFX9-NEXT: v_mov_b32_e32 v22, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -39799,20 +39759,20 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; GFX9-NEXT: s_cbranch_execnz .LBB49_3 ; GFX9-NEXT: .LBB49_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -39823,13 +39783,13 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 @@ -39846,60 +39806,60 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; GFX9-NEXT: .LBB49_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 ; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v30 ; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 ; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 ; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v30, v59, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 ; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 @@ -40424,8 +40384,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v11 @@ -40669,8 +40629,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v54 -; SI-NEXT: v_or_b32_e32 v2, v2, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v59 +; SI-NEXT: v_or_b32_e32 v2, v2, v54 ; SI-NEXT: v_or_b32_e32 v3, v3, v58 ; SI-NEXT: v_or_b32_e32 v4, v4, v53 ; SI-NEXT: v_or_b32_e32 v5, v5, v57 @@ -40697,8 +40657,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v27, v27, v32 ; SI-NEXT: v_or_b32_e32 v28, v28, v63 ; SI-NEXT: v_or_b32_e32 v29, v29, v62 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr57 @@ -40830,8 +40790,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_or_b32_e32 v2, v59, v2 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v2, v54, v2 ; SI-NEXT: v_or_b32_e32 v3, v58, v3 ; SI-NEXT: v_or_b32_e32 v4, v53, v4 ; SI-NEXT: v_or_b32_e32 v5, v57, v5 @@ -41058,10 +41018,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v29, 3 ; VI-NEXT: v_add_u16_e32 v0, 3, v61 ; VI-NEXT: v_add_u16_sdwa v1, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_u16_e32 v2, 3, v60 -; VI-NEXT: v_add_u16_sdwa v3, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: v_add_u16_e32 v1, 3, v60 +; VI-NEXT: v_add_u16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_add_u16_e32 v2, 3, v59 ; VI-NEXT: v_add_u16_sdwa v3, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -41256,8 +41216,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v45 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v46 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v47 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v56 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v57 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v59 @@ -41293,8 +41253,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 @@ -41336,8 +41296,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr63 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr36 ; GFX9-NEXT: ; implicit-def: $vgpr62 @@ -41451,8 +41411,8 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 ; GFX9-NEXT: v_perm_b32 v3, v36, v58, s6 ; GFX9-NEXT: v_perm_b32 v4, v35, v57, s6 -; GFX9-NEXT: v_perm_b32 v5, v34, v56, s6 -; GFX9-NEXT: v_perm_b32 v6, v63, v47, s6 +; GFX9-NEXT: v_perm_b32 v5, v63, v56, s6 +; GFX9-NEXT: v_perm_b32 v6, v34, v47, s6 ; GFX9-NEXT: v_perm_b32 v7, v33, v46, s6 ; GFX9-NEXT: v_perm_b32 v8, v32, v45, s6 ; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] @@ -41745,43 +41705,41 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v60, v16 -; SI-NEXT: v_mov_b32_e32 v53, v14 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v62, v12 -; SI-NEXT: v_mov_b32_e32 v32, v10 -; SI-NEXT: v_mov_b32_e32 v55, v8 -; SI-NEXT: v_mov_b32_e32 v37, v6 -; SI-NEXT: v_mov_b32_e32 v41, v4 -; SI-NEXT: v_mov_b32_e32 v44, v2 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v63, v0 +; SI-NEXT: v_mov_b32_e32 v53, v16 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v55, v12 +; SI-NEXT: v_mov_b32_e32 v46, v10 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: v_mov_b32_e32 v56, v6 +; SI-NEXT: v_mov_b32_e32 v33, v4 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v35, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v39, v26 -; SI-NEXT: v_mov_b32_e32 v48, v24 -; SI-NEXT: v_mov_b32_e32 v49, v22 -; SI-NEXT: v_mov_b32_e32 v47, v20 -; SI-NEXT: v_mov_b32_e32 v50, v18 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: v_mov_b32_e32 v43, v24 +; SI-NEXT: v_mov_b32_e32 v52, v22 +; SI-NEXT: v_mov_b32_e32 v44, v20 +; SI-NEXT: v_mov_b32_e32 v45, v18 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 @@ -41789,15 +41747,15 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -41809,105 +41767,104 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v7, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v7, v0, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v8, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v9, v0, v28 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: v_or_b32_e32 v10, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 -; SI-NEXT: v_or_b32_e32 v17, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v18, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v19, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v16, v0, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v17, v0, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v18, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v19, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v45 +; SI-NEXT: v_or_b32_e32 v20, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v43 +; SI-NEXT: v_or_b32_e32 v21, v0, v40 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s18, 0xffff -; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_and_b32 s6, s20, 0xffff -; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_and_b32 s7, s22, 0xffff -; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: s_or_b32 s7, s7, s8 -; SI-NEXT: s_and_b32 s8, s24, 0xffff -; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v26 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xffff -; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: s_or_b32 s9, s9, s10 -; SI-NEXT: s_and_b32 s10, s28, 0xffff -; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_mov_b32_e32 v6, s12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 -; SI-NEXT: v_or_b32_e32 v23, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v22, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v23, v0, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v38 +; SI-NEXT: v_or_b32_e32 v24, v0, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v25, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v26, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 -; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_or_b32_e32 v27, v0, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v27, v0, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v29, v0, v33 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_or_b32_e32 v28, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v29, v0, v36 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -41944,125 +41901,126 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 +; SI-NEXT: v_or_b32_e32 v0, v49, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v35, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v54, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB51_3: ; %end @@ -42085,67 +42043,64 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v45, v43 -; SI-NEXT: v_mov_b32_e32 v44, v42 -; SI-NEXT: v_mov_b32_e32 v43, v41 -; SI-NEXT: v_mov_b32_e32 v42, v40 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v41, v51 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v40, v50 +; SI-NEXT: v_mov_b32_e32 v51, v49 +; SI-NEXT: v_mov_b32_e32 v50, v48 +; SI-NEXT: v_mov_b32_e32 v49, v39 +; SI-NEXT: v_mov_b32_e32 v48, v38 +; SI-NEXT: v_mov_b32_e32 v39, v37 ; SI-NEXT: v_mov_b32_e32 v38, v36 ; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v36, v55 -; SI-NEXT: v_mov_b32_e32 v55, v34 -; SI-NEXT: v_mov_b32_e32 v35, v54 -; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v36, v34 +; SI-NEXT: v_mov_b32_e32 v35, v33 ; SI-NEXT: v_mov_b32_e32 v34, v32 ; SI-NEXT: v_mov_b32_e32 v33, v62 -; SI-NEXT: v_mov_b32_e32 v62, v60 ; SI-NEXT: v_mov_b32_e32 v32, v63 -; SI-NEXT: v_mov_b32_e32 v63, v53 -; SI-NEXT: v_mov_b32_e32 v53, v61 -; SI-NEXT: v_mov_b32_e32 v61, v52 -; SI-NEXT: v_mov_b32_e32 v52, v59 -; SI-NEXT: v_mov_b32_e32 v59, v51 -; SI-NEXT: v_mov_b32_e32 v51, v57 -; SI-NEXT: v_mov_b32_e32 v57, v50 -; SI-NEXT: v_mov_b32_e32 v50, v47 -; SI-NEXT: v_mov_b32_e32 v47, v48 -; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v61, v59 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v59, v57 +; SI-NEXT: v_mov_b32_e32 v57, v47 +; SI-NEXT: v_mov_b32_e32 v47, v55 +; SI-NEXT: v_mov_b32_e32 v55, v54 +; SI-NEXT: v_mov_b32_e32 v54, v53 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_mov_b32_e32 v45, v52 +; SI-NEXT: v_mov_b32_e32 v52, v43 +; SI-NEXT: v_mov_b32_e32 v43, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v47 -; SI-NEXT: v_mov_b32_e32 v47, v50 -; SI-NEXT: v_mov_b32_e32 v50, v57 -; SI-NEXT: v_mov_b32_e32 v57, v51 -; SI-NEXT: v_mov_b32_e32 v51, v59 -; SI-NEXT: v_mov_b32_e32 v59, v52 -; SI-NEXT: v_mov_b32_e32 v52, v61 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_mov_b32_e32 v53, v63 -; SI-NEXT: v_mov_b32_e32 v63, v32 ; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v30, v43 +; SI-NEXT: v_mov_b32_e32 v43, v52 +; SI-NEXT: v_mov_b32_e32 v52, v45 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v55, v47 +; SI-NEXT: v_mov_b32_e32 v47, v57 +; SI-NEXT: v_mov_b32_e32 v57, v59 +; SI-NEXT: v_mov_b32_e32 v59, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 ; SI-NEXT: v_mov_b32_e32 v32, v34 -; SI-NEXT: v_mov_b32_e32 v33, v54 -; SI-NEXT: v_mov_b32_e32 v54, v35 -; SI-NEXT: v_mov_b32_e32 v34, v55 -; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v33, v35 +; SI-NEXT: v_mov_b32_e32 v34, v36 ; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v37, v40 -; SI-NEXT: v_mov_b32_e32 v38, v41 -; SI-NEXT: v_mov_b32_e32 v40, v42 -; SI-NEXT: v_mov_b32_e32 v41, v43 -; SI-NEXT: v_mov_b32_e32 v42, v44 -; SI-NEXT: v_mov_b32_e32 v43, v45 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v39 +; SI-NEXT: v_mov_b32_e32 v38, v48 +; SI-NEXT: v_mov_b32_e32 v39, v49 +; SI-NEXT: v_mov_b32_e32 v48, v50 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_mov_b32_e32 v50, v40 +; SI-NEXT: v_mov_b32_e32 v51, v41 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v60i16_to_v15f64_scalar: @@ -42193,71 +42148,71 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -42276,125 +42231,125 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; VI-NEXT: v_mov_b32_e32 v13, s63 ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v0, 16 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v55 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v54 -; VI-NEXT: v_lshlrev_b32_sdwa v3, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v51 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v1, 16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v54 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v50 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_and_b32 s4, s16, 0xffff ; VI-NEXT: s_lshl_b32 s5, s43, 16 ; VI-NEXT: s_add_i32 s17, s17, 3 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s17, 0xffff ; VI-NEXT: s_lshl_b32 s16, s42, 16 ; VI-NEXT: s_add_i32 s18, s18, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s5, s16, s5 ; VI-NEXT: s_and_b32 s16, s18, 0xffff ; VI-NEXT: s_lshl_b32 s17, s41, 16 ; VI-NEXT: s_add_i32 s19, s19, 3 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v38 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v38 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s16, s17, s16 ; VI-NEXT: s_and_b32 s17, s19, 0xffff ; VI-NEXT: s_lshl_b32 s18, s40, 16 ; VI-NEXT: s_add_i32 s20, s20, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s17, s18, s17 ; VI-NEXT: s_and_b32 s18, s20, 0xffff ; VI-NEXT: s_lshl_b32 s15, s15, 16 ; VI-NEXT: s_add_i32 s21, s21, 3 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v37 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s15, s15, s18 ; VI-NEXT: s_and_b32 s18, s21, 0xffff ; VI-NEXT: s_lshl_b32 s14, s14, 16 ; VI-NEXT: s_add_i32 s22, s22, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s14, s14, s18 ; VI-NEXT: s_and_b32 s18, s22, 0xffff ; VI-NEXT: s_lshl_b32 s13, s13, 16 ; VI-NEXT: s_add_i32 s23, s23, 3 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s13, s13, s18 ; VI-NEXT: s_and_b32 s18, s23, 0xffff ; VI-NEXT: s_lshl_b32 s12, s12, 16 ; VI-NEXT: s_add_i32 s24, s24, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s12, s12, s18 ; VI-NEXT: s_and_b32 s18, s24, 0xffff ; VI-NEXT: s_lshl_b32 s11, s11, 16 ; VI-NEXT: s_add_i32 s25, s25, 3 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v35 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s11, s11, s18 ; VI-NEXT: s_and_b32 s18, s25, 0xffff ; VI-NEXT: s_lshl_b32 s10, s10, 16 ; VI-NEXT: s_add_i32 s26, s26, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s10, s10, s18 ; VI-NEXT: s_and_b32 s18, s26, 0xffff ; VI-NEXT: s_lshl_b32 s9, s9, 16 ; VI-NEXT: s_add_i32 s27, s27, 3 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v34 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s9, s9, s18 ; VI-NEXT: s_and_b32 s18, s27, 0xffff ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_add_i32 s28, s28, 3 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s8, s8, s18 ; VI-NEXT: s_and_b32 s18, s28, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_add_i32 s29, s29, 3 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: v_lshlrev_b32_sdwa v2, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s7, s7, s18 ; VI-NEXT: s_and_b32 s18, s29, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_or_b32 s6, s6, s18 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v32 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x30000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_add_i32 s4, s4, 0x30000 ; VI-NEXT: s_add_i32 s5, s5, 0x30000 ; VI-NEXT: s_add_i32 s16, s16, 0x30000 @@ -42409,7 +42364,7 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; VI-NEXT: s_add_i32 s8, s8, 0x30000 ; VI-NEXT: s_add_i32 s7, s7, 0x30000 ; VI-NEXT: s_add_i32 s6, s6, 0x30000 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x30000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -42516,6 +42471,8 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 @@ -42541,10 +42498,8 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -42562,8 +42517,8 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v13, s19 ; GFX9-NEXT: s_cbranch_execnz .LBB51_3 ; GFX9-NEXT: .LBB51_2: ; %cmp.true -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v55 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v54 ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v53 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v52 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v51 @@ -42578,8 +42533,8 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v0, v63, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v62, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v18, v59, 16, v18 @@ -42594,8 +42549,6 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v27 ; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v28 ; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v29 -; GFX9-NEXT: v_pk_add_u16 v14, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v0, s6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v1, s7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v2, s8, 3 op_sel_hi:[1,0] @@ -42610,6 +42563,8 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX9-NEXT: v_pk_add_u16 v11, s17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v12, s18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] @@ -42740,61 +42695,77 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3 ; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s40, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s41, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, s1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s2, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s5, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s6, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s7, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s8, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s12, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s13, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s14, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v189, v189, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v188, v188, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v187, v187, 3 op_sel_hi:[1,0] @@ -42807,22 +42778,6 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, s0, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, s2, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, s4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v35, s5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v44, s6, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v54, s7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v65, s8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v77, s9, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v90, s10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v104, s11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v119, s12, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v135, s13, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v152, s14, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v170, s15, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: .LBB51_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -43112,27 +43067,45 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -43149,38 +43122,39 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; kill: killed $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -43198,81 +43172,82 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v63, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 @@ -43289,67 +43264,44 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v27 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v31, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v58 -; SI-NEXT: v_mov_b32_e32 v58, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v60 -; SI-NEXT: v_mov_b32_e32 v60, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v62 -; SI-NEXT: v_mov_b32_e32 v62, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v61 -; SI-NEXT: v_mov_b32_e32 v61, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v63 -; SI-NEXT: v_mov_b32_e32 v63, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v48 -; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 @@ -43369,65 +43321,79 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[49:50], v[3:4], 1.0 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v35 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_mov_b32_e32 v63, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v20 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 +; SI-NEXT: v_mov_b32_e32 v59, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 +; SI-NEXT: v_add_f64 v[45:46], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[41:42], v[3:4], 1.0 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 -; SI-NEXT: v_add_f64 v[32:33], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v60 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 @@ -43446,286 +43412,289 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v41, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v5 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: v_mov_b32_e32 v33, v29 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v61, v27 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -43733,42 +43702,30 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -44380,6 +44337,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_mov_b32_e32 v23, s16 ; SI-NEXT: v_mov_b32_e32 v24, s17 ; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v30, s19 ; SI-NEXT: v_mov_b32_e32 v27, s20 ; SI-NEXT: v_mov_b32_e32 v28, s21 @@ -44389,7 +44347,6 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_mov_b32_e32 v22, s25 ; SI-NEXT: v_mov_b32_e32 v19, s26 ; SI-NEXT: v_mov_b32_e32 v20, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_mov_b32_e32 v18, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -44626,7 +44583,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v56 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 @@ -44653,7 +44610,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 @@ -44692,8 +44649,8 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v53, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 ; SI-NEXT: v_mov_b32_e32 v33, v15 ; SI-NEXT: v_mov_b32_e32 v31, v16 ; SI-NEXT: v_mov_b32_e32 v62, v13 @@ -45085,21 +45042,21 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v29, s18 -; VI-NEXT: v_mov_b32_e32 v30, s19 -; VI-NEXT: v_mov_b32_e32 v27, s20 -; VI-NEXT: v_mov_b32_e32 v28, s21 -; VI-NEXT: v_mov_b32_e32 v25, s22 -; VI-NEXT: v_mov_b32_e32 v26, s23 -; VI-NEXT: v_mov_b32_e32 v23, s24 -; VI-NEXT: v_mov_b32_e32 v24, s25 -; VI-NEXT: v_mov_b32_e32 v21, s26 -; VI-NEXT: v_mov_b32_e32 v22, s27 +; VI-NEXT: v_mov_b32_e32 v19, s16 +; VI-NEXT: v_mov_b32_e32 v20, s17 +; VI-NEXT: v_mov_b32_e32 v17, s18 +; VI-NEXT: v_mov_b32_e32 v18, s19 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_mov_b32_e32 v19, s28 -; VI-NEXT: v_mov_b32_e32 v20, s29 +; VI-NEXT: v_mov_b32_e32 v29, s20 +; VI-NEXT: v_mov_b32_e32 v30, s21 +; VI-NEXT: v_mov_b32_e32 v27, s22 +; VI-NEXT: v_mov_b32_e32 v28, s23 +; VI-NEXT: v_mov_b32_e32 v25, s24 +; VI-NEXT: v_mov_b32_e32 v26, s25 +; VI-NEXT: v_mov_b32_e32 v23, s26 +; VI-NEXT: v_mov_b32_e32 v24, s27 +; VI-NEXT: v_mov_b32_e32 v21, s28 +; VI-NEXT: v_mov_b32_e32 v22, s29 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -45130,20 +45087,20 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; VI-NEXT: s_cbranch_execnz .LBB53_3 ; VI-NEXT: .LBB53_2: ; %cmp.true ; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -45154,13 +45111,13 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; VI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; VI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 @@ -45177,49 +45134,49 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; VI-NEXT: .LBB53_3: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_or_b32_sdwa v32, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; VI-NEXT: v_or_b32_sdwa v33, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 -; VI-NEXT: v_or_b32_sdwa v34, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; VI-NEXT: v_or_b32_sdwa v33, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; VI-NEXT: v_or_b32_sdwa v34, v17, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 -; VI-NEXT: v_or_b32_sdwa v35, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v35, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; VI-NEXT: v_or_b32_sdwa v36, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v36, v29, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; VI-NEXT: v_or_b32_sdwa v37, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v37, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 -; VI-NEXT: v_or_b32_sdwa v38, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v38, v27, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; VI-NEXT: v_or_b32_sdwa v39, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v39, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 -; VI-NEXT: v_or_b32_sdwa v48, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v48, v25, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v49 -; VI-NEXT: v_or_b32_sdwa v49, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v49, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v50 -; VI-NEXT: v_or_b32_sdwa v50, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v50, v23, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 -; VI-NEXT: v_or_b32_sdwa v51, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v51, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v52 -; VI-NEXT: v_or_b32_sdwa v52, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v52, v21, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 -; VI-NEXT: v_or_b32_sdwa v53, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v53, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v59 ; VI-NEXT: v_or_b32_sdwa v30, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 @@ -45319,21 +45276,21 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_mov_b32_e32 v17, s16 -; GFX9-NEXT: v_mov_b32_e32 v18, s17 -; GFX9-NEXT: v_mov_b32_e32 v29, s18 -; GFX9-NEXT: v_mov_b32_e32 v30, s19 -; GFX9-NEXT: v_mov_b32_e32 v27, s20 -; GFX9-NEXT: v_mov_b32_e32 v28, s21 -; GFX9-NEXT: v_mov_b32_e32 v25, s22 -; GFX9-NEXT: v_mov_b32_e32 v26, s23 -; GFX9-NEXT: v_mov_b32_e32 v23, s24 -; GFX9-NEXT: v_mov_b32_e32 v24, s25 -; GFX9-NEXT: v_mov_b32_e32 v21, s26 -; GFX9-NEXT: v_mov_b32_e32 v22, s27 +; GFX9-NEXT: v_mov_b32_e32 v19, s16 +; GFX9-NEXT: v_mov_b32_e32 v20, s17 +; GFX9-NEXT: v_mov_b32_e32 v17, s18 +; GFX9-NEXT: v_mov_b32_e32 v18, s19 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: v_mov_b32_e32 v19, s28 -; GFX9-NEXT: v_mov_b32_e32 v20, s29 +; GFX9-NEXT: v_mov_b32_e32 v29, s20 +; GFX9-NEXT: v_mov_b32_e32 v30, s21 +; GFX9-NEXT: v_mov_b32_e32 v27, s22 +; GFX9-NEXT: v_mov_b32_e32 v28, s23 +; GFX9-NEXT: v_mov_b32_e32 v25, s24 +; GFX9-NEXT: v_mov_b32_e32 v26, s25 +; GFX9-NEXT: v_mov_b32_e32 v23, s26 +; GFX9-NEXT: v_mov_b32_e32 v24, s27 +; GFX9-NEXT: v_mov_b32_e32 v21, s28 +; GFX9-NEXT: v_mov_b32_e32 v22, s29 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill @@ -45364,20 +45321,20 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; GFX9-NEXT: s_cbranch_execnz .LBB53_3 ; GFX9-NEXT: .LBB53_2: ; %cmp.true ; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 @@ -45388,13 +45345,13 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v14 ; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v13 @@ -45411,60 +45368,60 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v26 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v28 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v28 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v19 ; GFX9-NEXT: .LBB53_3: ; %end ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 ; GFX9-NEXT: v_lshl_or_b32 v34, v34, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v30 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v18 ; GFX9-NEXT: v_lshl_or_b32 v35, v35, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v29 ; GFX9-NEXT: v_lshl_or_b32 v36, v36, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v30 ; GFX9-NEXT: v_lshl_or_b32 v37, v37, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v27 ; GFX9-NEXT: v_lshl_or_b32 v38, v38, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v28 ; GFX9-NEXT: v_lshl_or_b32 v39, v39, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v25 ; GFX9-NEXT: v_lshl_or_b32 v48, v48, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v26 ; GFX9-NEXT: v_lshl_or_b32 v49, v49, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v23 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v50, v50, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v24 ; GFX9-NEXT: v_lshl_or_b32 v30, v59, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v51, v51, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v21 ; GFX9-NEXT: v_lshl_or_b32 v31, v31, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v52, v52, 16, v17 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v20 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v22 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; GFX9-NEXT: v_lshl_or_b32 v53, v53, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v17, v58, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v32, v32, 16, v19 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; GFX9-NEXT: v_lshl_or_b32 v18, v57, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v33, v33, 16, v19 ; GFX9-NEXT: v_lshl_or_b32 v19, v56, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX9-NEXT: v_lshl_or_b32 v20, v47, 16, v0 @@ -46372,23 +46329,29 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v56 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v24, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v50 @@ -46413,20 +46376,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload @@ -46434,175 +46386,178 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload @@ -46852,10 +46807,10 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v29, 0x200 ; VI-NEXT: v_add_f16_sdwa v0, v61, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v2, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v60 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_add_f16_sdwa v1, v60, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v60 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_add_f16_sdwa v2, v59, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v59 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -47547,48 +47502,43 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:52 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v45, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v15 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v55, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 @@ -47598,10 +47548,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 @@ -47611,37 +47560,41 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v54 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 @@ -47653,7 +47606,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v62 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 @@ -47680,46 +47633,48 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_or_b32_e32 v13, v43, v13 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v48, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_or_b32_e32 v12, v45, v12 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 ; SI-NEXT: v_mov_b32_e32 v57, v39 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v39, v8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 ; SI-NEXT: v_or_b32_e32 v4, v9, v4 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -47728,328 +47683,313 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v19, v21, v19 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v7, v49, v7 ; SI-NEXT: v_mov_b32_e32 v56, v34 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_mov_b32_e32 v62, v36 +; SI-NEXT: v_mov_b32_e32 v60, v35 ; SI-NEXT: v_or_b32_e32 v11, v35, v11 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 -; SI-NEXT: v_or_b32_e32 v12, v62, v12 +; SI-NEXT: v_mov_b32_e32 v47, v63 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_or_b32_e32 v13, v43, v13 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: v_or_b32_e32 v14, v55, v14 ; SI-NEXT: v_or_b32_e32 v15, v61, v15 ; SI-NEXT: v_or_b32_e32 v20, v53, v20 ; SI-NEXT: v_or_b32_e32 v21, v51, v21 -; SI-NEXT: v_or_b32_e32 v22, v30, v22 -; SI-NEXT: v_or_b32_e32 v23, v31, v23 +; SI-NEXT: v_or_b32_e32 v22, v31, v22 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_or_b32_e32 v17, v32, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v16, v43, v16 +; SI-NEXT: v_or_b32_e32 v17, v32, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v35, v39 +; SI-NEXT: v_or_b32_e32 v16, v45, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 -; SI-NEXT: v_or_b32_e32 v9, v39, v9 -; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v36, v39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v38, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v37 +; SI-NEXT: v_or_b32_e32 v9, v39, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_or_b32_e32 v29, v54, v29 ; SI-NEXT: v_mov_b32_e32 v54, v32 ; SI-NEXT: s_branch .LBB55_3 ; SI-NEXT: .LBB55_2: +; SI-NEXT: v_mov_b32_e32 v48, v1 ; SI-NEXT: v_mov_b32_e32 v54, v53 ; SI-NEXT: v_mov_b32_e32 v53, v52 ; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v30 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v31 +; SI-NEXT: v_mov_b32_e32 v51, v30 +; SI-NEXT: v_mov_b32_e32 v50, v31 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v47, v36 -; SI-NEXT: v_mov_b32_e32 v46, v35 -; SI-NEXT: v_mov_b32_e32 v44, v43 -; SI-NEXT: v_mov_b32_e32 v30, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v62, v36 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v30, v51 ; SI-NEXT: v_mov_b32_e32 v51, v52 ; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v57, v39 ; SI-NEXT: v_mov_b32_e32 v56, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v45, v62 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: v_mov_b32_e32 v47, v63 +; SI-NEXT: v_mov_b32_e32 v44, v43 ; SI-NEXT: v_mov_b32_e32 v42, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: v_mov_b32_e32 v31, v48 -; SI-NEXT: v_mov_b32_e32 v48, v49 +; SI-NEXT: v_mov_b32_e32 v31, v50 ; SI-NEXT: .LBB55_3: ; %Flow ; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB55_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v57 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v62 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 ; SI-NEXT: v_mov_b32_e32 v55, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v51 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v36 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v37 +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v11, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v32 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v47 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v60 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v16, v14 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 @@ -48059,30 +47999,38 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v52 ; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v23, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_or_b32_e32 v23, v25, v23 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 @@ -48194,71 +48142,71 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; VI-NEXT: s_and_b32 s5, 0xffff, s17 ; VI-NEXT: s_lshl_b32 s44, s42, 16 ; VI-NEXT: v_or_b32_sdwa v14, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s5, s5, s44 ; VI-NEXT: s_and_b32 s44, 0xffff, s18 ; VI-NEXT: s_lshl_b32 s45, s41, 16 -; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v15, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s44, s44, s45 ; VI-NEXT: s_and_b32 s45, 0xffff, s19 ; VI-NEXT: s_lshl_b32 s46, s40, 16 -; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v16, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s45, s45, s46 ; VI-NEXT: s_and_b32 s46, 0xffff, s20 ; VI-NEXT: s_lshl_b32 s47, s15, 16 -; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v17, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s46, s46, s47 ; VI-NEXT: s_and_b32 s47, 0xffff, s21 ; VI-NEXT: s_lshl_b32 s56, s14, 16 -; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v18, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s47, s47, s56 ; VI-NEXT: s_and_b32 s56, 0xffff, s22 ; VI-NEXT: s_lshl_b32 s57, s13, 16 -; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v19, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s56, s56, s57 ; VI-NEXT: s_and_b32 s57, 0xffff, s23 ; VI-NEXT: s_lshl_b32 s58, s12, 16 -; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v20, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s57, s57, s58 ; VI-NEXT: s_and_b32 s58, 0xffff, s24 ; VI-NEXT: s_lshl_b32 s59, s11, 16 -; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v21, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s58, s58, s59 ; VI-NEXT: s_and_b32 s59, 0xffff, s25 ; VI-NEXT: s_lshl_b32 s60, s10, 16 -; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v22, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s59, s59, s60 ; VI-NEXT: s_and_b32 s60, 0xffff, s26 ; VI-NEXT: s_lshl_b32 s61, s9, 16 -; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v23, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s60, s60, s61 ; VI-NEXT: s_and_b32 s61, 0xffff, s27 ; VI-NEXT: s_lshl_b32 s62, s8, 16 -; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v24, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s61, s61, s62 ; VI-NEXT: s_and_b32 s62, 0xffff, s28 ; VI-NEXT: s_lshl_b32 s63, s7, 16 -; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v25, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_or_b32 s62, s62, s63 ; VI-NEXT: s_and_b32 s63, 0xffff, s29 ; VI-NEXT: s_lshl_b32 s72, s6, 16 -; VI-NEXT: v_lshlrev_b32_sdwa v2, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v26, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_or_b32 s63, s63, s72 ; VI-NEXT: v_or_b32_sdwa v27, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_sdwa v1, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: s_or_b32 s63, s63, s72 -; VI-NEXT: v_or_b32_sdwa v15, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -48279,13 +48227,13 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; VI-NEXT: .LBB55_2: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v13, 0x200 ; VI-NEXT: v_mov_b32_e32 v0, s43 -; VI-NEXT: v_mov_b32_e32 v2, s42 ; VI-NEXT: v_add_f16_sdwa v0, v0, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, s16, v13 -; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, s17, v13 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v2 +; VI-NEXT: v_mov_b32_e32 v1, s42 +; VI-NEXT: v_add_f16_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, s17, v13 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: v_mov_b32_e32 v2, s41 ; VI-NEXT: v_add_f16_sdwa v2, v2, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, s18, v13 @@ -48474,6 +48422,8 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v52 @@ -48499,10 +48449,8 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; GFX9-NEXT: v_lshl_or_b32 v27, v42, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; GFX9-NEXT: v_lshl_or_b32 v28, v41, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v32 -; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v29, v40, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 @@ -48536,9 +48484,9 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v34 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v33 ; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v32 +; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_mov_b32_e32 v13, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v14, v63, 16, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x200 ; GFX9-NEXT: v_lshl_or_b32 v15, v62, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v16, v61, 16, v16 ; GFX9-NEXT: v_lshl_or_b32 v17, v60, 16, v17 @@ -48700,61 +48648,77 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v190, v6 :: v_dual_mov_b32 v185, v4 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v186, v3 :: v_dual_mov_b32 v187, v2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v188, v1 :: v_dual_mov_b32 v189, v0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s29, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s28, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s27, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s26, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s25, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s24, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s23, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s22, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s21, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s20, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s19, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s18, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s40, s0, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s42, 0 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s40, s0, s40 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s41, s1, s41 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s46 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s3, s45 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s16, s44 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s17, s43 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s18, s4 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s19, s5 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s20, s6 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s21, s7 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s22, s8 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s23, s9 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s24, s10 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s25, s11 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s26, s12 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s27, s13 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s28, s14 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s29, s15 +; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s15, s27, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s14, s26, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s13, s25, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s24, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s23, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s22, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s21, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s20, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s19, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s18, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s17, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s16, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s3, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s2, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s0, 16 +; GFX11-TRUE16-NEXT: s_mov_b32 s40, 0 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s46 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s45 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s44 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s43 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s17, s5 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s18, s6 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s19, s7 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s20, s8 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s21, s9 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s22, s10 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s23, s11 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s24, s12 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s13, s25, s13 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s14, s26, s14 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s15, s27, s15 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s16, s28, s42 +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s17, s29, s41 ; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB55_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v5, s0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s41 :: v_dual_mov_b32 v9, s1 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v27, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s3 :: v_dual_mov_b32 v35, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s6 :: v_dual_mov_b32 v65, s8 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s7 :: v_dual_mov_b32 v77, s9 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s10 :: v_dual_mov_b32 v119, s12 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s11 :: v_dual_mov_b32 v135, s13 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s14 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s15 -; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s42 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v27, s6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v35, s7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v44, s8 :: v_dual_mov_b32 v65, s10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, s9 :: v_dual_mov_b32 v77, s11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v119, s14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v104, s13 :: v_dual_mov_b32 v135, s15 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v152, s16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v170, s17 +; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s40 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB55_3 ; GFX11-TRUE16-NEXT: .LBB55_2: ; %cmp.true -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s40 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s41 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, s0 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, s1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s2 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s5 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s6 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s7 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s8 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s12 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s13 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s14 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v189, 0x200, v189 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v188, 0x200, v188 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v187, 0x200, v187 op_sel_hi:[0,1] @@ -48767,22 +48731,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, s0 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, s2 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, s4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v35, 0x200, s5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v44, 0x200, s6 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v54, 0x200, s7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v65, 0x200, s8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v77, 0x200, s9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v90, 0x200, s10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v104, 0x200, s11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v119, 0x200, s12 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v135, 0x200, s13 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v152, 0x200, s14 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v170, 0x200, s15 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: .LBB55_3: ; %end ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v5 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v20 :: v_dual_mov_b32 v6, v27 @@ -50694,7 +50642,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 @@ -50710,221 +50658,222 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB57_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s29 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v2 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v57, v27 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v48 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v3 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v4 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v38 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v6 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v36 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v8 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v9 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v34 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v10 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v11 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v12 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v13 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v32 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v14 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: v_mov_b32_e32 v1, v12 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v15 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v2, v13 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v50 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v3 +; SI-NEXT: v_mov_b32_e32 v3, v14 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v16 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v4 +; SI-NEXT: v_mov_b32_e32 v4, v11 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v5 +; SI-NEXT: v_mov_b32_e32 v5, v15 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v17 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v6 +; SI-NEXT: v_mov_b32_e32 v6, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v61 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v7 +; SI-NEXT: v_mov_b32_e32 v7, v29 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v8 +; SI-NEXT: v_mov_b32_e32 v8, v30 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v9 +; SI-NEXT: v_mov_b32_e32 v9, v27 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v10 +; SI-NEXT: v_mov_b32_e32 v10, v28 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v11 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v12 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v13 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v14 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v15 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v17 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v20 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v19 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v21 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v20 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v22 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v21 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v22 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v24 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v23 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v25 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v26 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v25 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v26 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v28 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v30 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v39 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v59 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_branch .LBB57_3 ; SI-NEXT: .LBB57_2: -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v8, v30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v7, v29 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v34 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v10, v28 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: v_mov_b32_e32 v34, v33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v9, v27 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: v_mov_b32_e32 v33, v32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v5, v15 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 +; SI-NEXT: v_mov_b32_e32 v3, v14 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v2, v13 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v1, v12 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: v_mov_b32_e32 v4, v11 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr42 @@ -50938,71 +50887,73 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; kill: killed $vgpr49 ; SI-NEXT: .LBB57_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v49, v50 ; SI-NEXT: v_mov_b32_e32 v50, v52 ; SI-NEXT: v_mov_b32_e32 v52, v54 @@ -51011,174 +50962,149 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v42, v44 ; SI-NEXT: v_mov_b32_e32 v44, v46 ; SI-NEXT: v_mov_b32_e32 v46, v56 -; SI-NEXT: v_mov_b32_e32 v56, v31 +; SI-NEXT: v_mov_b32_e32 v56, v57 ; SI-NEXT: s_cbranch_vccnz .LBB57_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v35 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v10 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v9 +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v8 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v7 +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v2 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s16 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s17 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s18 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s19 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s20 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v58 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s21 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v60 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v60 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v61 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v62 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v62 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s23 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s24 -; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 -; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v33 ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s25 -; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v37, vcc, 3, v37 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 +; SI-NEXT: v_add_i32_e32 v38, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v34 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_add_i32_e32 v35, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v50, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v61 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v62 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v58 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v46, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v56, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v57 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) @@ -51211,6 +51137,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s29 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 @@ -51279,13 +51206,16 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: .LBB57_5: ; %end ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -51294,7 +51224,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -51303,9 +51233,9 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -51315,8 +51245,8 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -51326,7 +51256,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -51337,7 +51267,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -51347,15 +51277,13 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -51520,9 +51448,11 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -51568,7 +51498,7 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 @@ -53787,6 +53717,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-LABEL: bitcast_v60f16_to_v60i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v38, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -53803,646 +53735,699 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v31, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v54 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v55 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v25, v41 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cvt_f16_f32_e32 v41, v43 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v44 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s28 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v46 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v38 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v53 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v20, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s29 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, s25 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v57 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v43, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v35, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v4, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, v22 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v33, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 -; SI-NEXT: v_mov_b32_e32 v41, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 -; SI-NEXT: v_mov_b32_e32 v53, v27 -; SI-NEXT: v_or_b32_e32 v28, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_mov_b32_e32 v22, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_mov_b32_e32 v17, v7 +; SI-NEXT: v_or_b32_e32 v6, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v38, v55 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_or_b32_e32 v46, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; SI-NEXT: v_or_b32_e32 v16, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v60 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 -; SI-NEXT: v_mov_b32_e32 v40, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: v_or_b32_e32 v34, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_lshr_b64 v[46:47], v[27:28], 16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v18, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v24, v12 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v60, v1, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v13, v43 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v54, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_or_b32_e32 v58, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v30, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_or_b32_e32 v36, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_or_b32_e32 v38, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v40, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_or_b32_e32 v22, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v26, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_or_b32_e32 v7, v3, v25 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_or_b32_e32 v2, v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v26, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_or_b32_e32 v49, v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v22, v3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v32, v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; SI-NEXT: v_or_b32_e32 v62, v1, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v18, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 -; SI-NEXT: v_or_b32_e32 v16, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v44, v29 +; SI-NEXT: v_or_b32_e32 v9, v5, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v61, v3, v57 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_or_b32_e32 v14, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 -; SI-NEXT: v_or_b32_e32 v12, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: v_or_b32_e32 v10, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v3, v17 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v5, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_mov_b32_e32 v44, v40 +; SI-NEXT: v_mov_b32_e32 v43, v39 +; SI-NEXT: v_lshr_b64 v[27:28], v[43:44], 16 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_or_b32_e32 v57, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 -; SI-NEXT: v_lshr_b64 v[58:59], v[34:35], 16 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshr_b64 v[55:56], v[21:22], 16 +; SI-NEXT: v_mov_b32_e32 v21, v23 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_mov_b32_e32 v59, v48 -; SI-NEXT: v_or_b32_e32 v6, v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v52, v39 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_or_b32_e32 v4, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 -; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v62 -; SI-NEXT: v_or_b32_e32 v62, v24, v32 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v42, v24, v27 -; SI-NEXT: v_mov_b32_e32 v48, v62 -; SI-NEXT: v_or_b32_e32 v50, v20, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 -; SI-NEXT: v_mov_b32_e32 v34, v42 -; SI-NEXT: v_lshr_b64 v[42:43], v[25:26], 16 -; SI-NEXT: v_mov_b32_e32 v62, v50 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v41, v20, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 -; SI-NEXT: v_or_b32_e32 v8, v38, v25 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: v_or_b32_e32 v8, v24, v21 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v20, v17 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v38, v15 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v53 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v54, v24, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v52, v38, v9 -; SI-NEXT: v_mov_b32_e32 v27, v52 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v36, v20, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshr_b64 v[52:53], v[15:16], 16 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v51, v24, v56 -; SI-NEXT: v_mov_b32_e32 v15, v51 -; SI-NEXT: v_lshr_b64 v[50:51], v[13:14], 16 -; SI-NEXT: v_or_b32_e32 v24, v20, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 -; SI-NEXT: v_or_b32_e32 v8, v38, v3 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v3, v3, v45 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v39, v1 -; SI-NEXT: v_lshr_b64 v[38:39], v[32:33], 16 -; SI-NEXT: v_mov_b32_e32 v32, v41 -; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[11:12], 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v11, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 -; SI-NEXT: v_mov_b32_e32 v39, v31 -; SI-NEXT: v_mov_b32_e32 v31, v60 -; SI-NEXT: v_mov_b32_e32 v60, v61 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v37 -; SI-NEXT: v_mov_b32_e32 v37, v55 -; SI-NEXT: v_lshr_b64 v[55:56], v[5:6], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 -; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_or_b32_e32 v24, v51, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v41, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v40 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; SI-NEXT: v_or_b32_e32 v39, v52, v12 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[39:40], v[25:26], 16 +; SI-NEXT: v_mov_b32_e32 v25, v50 +; SI-NEXT: v_lshr_b64 v[50:51], v[37:38], 16 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[51:52], v[53:54], 16 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[51:52], v[59:60], 16 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[51:52], v[17:18], 16 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[51:52], v[33:34], 16 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[51:52], v[15:16], 16 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[51:52], v[45:46], 16 +; SI-NEXT: v_mov_b32_e32 v53, v7 +; SI-NEXT: v_mov_b32_e32 v7, v50 +; SI-NEXT: v_mov_b32_e32 v29, v51 +; SI-NEXT: v_lshr_b64 v[50:51], v[3:4], 16 +; SI-NEXT: v_mov_b32_e32 v37, v58 +; SI-NEXT: v_mov_b32_e32 v40, v47 +; SI-NEXT: v_mov_b32_e32 v33, v49 +; SI-NEXT: v_mov_b32_e32 v49, v2 +; SI-NEXT: v_mov_b32_e32 v17, v46 +; SI-NEXT: v_lshr_b64 v[35:36], v[35:36], 16 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[35:36], v[57:58], 16 +; SI-NEXT: v_mov_b32_e32 v57, v13 +; SI-NEXT: v_mov_b32_e32 v56, v12 +; SI-NEXT: v_lshr_b64 v[51:52], v[56:57], 16 +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[58:59], v[5:6], 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v12, v60 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, v54 +; SI-NEXT: v_mov_b32_e32 v60, v34 +; SI-NEXT: v_mov_b32_e32 v15, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, v50 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v62 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index da908bc280e6e..04812a543cf08 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -8100,66 +8100,66 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v19 -; SI-NEXT: v_lshr_b64 v[17:18], v[5:6], 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v20 +; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 24 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v20 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v19 -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v17 +; SI-NEXT: v_lshr_b64 v[3:4], v[17:18], 24 +; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[17:18], 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v18 ; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v13 -; SI-NEXT: v_lshr_b64 v[14:15], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB39_3 ; SI-NEXT: .LBB39_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshr_b64 v[16:17], v[0:1], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[0:1], 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 -; SI-NEXT: v_lshr_b64 v[17:18], v[5:6], 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 16 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 ; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16 -; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 24 -; SI-NEXT: v_lshr_b64 v[14:15], v[16:17], 16 -; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v17 +; SI-NEXT: v_lshr_b64 v[3:4], v[17:18], 24 +; SI-NEXT: v_lshr_b64 v[15:16], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[17:18], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v18 ; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v13 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v5 ; SI-NEXT: .LBB39_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v16 -; SI-NEXT: v_mov_b32_e32 v2, v14 -; SI-NEXT: v_mov_b32_e32 v4, v17 +; SI-NEXT: v_mov_b32_e32 v0, v17 +; SI-NEXT: v_mov_b32_e32 v2, v15 +; SI-NEXT: v_mov_b32_e32 v4, v18 ; SI-NEXT: v_mov_b32_e32 v5, v8 ; SI-NEXT: v_mov_b32_e32 v8, v13 ; SI-NEXT: v_mov_b32_e32 v9, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB39_4: -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll index 42f76c4a10d2a..2bbf57f6a50cd 100644 --- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 ; CHECK-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x0 ; CHECK-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10 -; CHECK-NEXT: v_mov_b32_e32 v20, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x9037ab78 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x3e21eeb6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 @@ -16,32 +16,32 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_bitcmp1_b32 s0, 8 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1 ; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78 -; CHECK-NEXT: v_mov_b32_e32 v2, 0xa17f65f6 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xbe927e4f -; CHECK-NEXT: v_mov_b32_e32 v4, 0x19f4ec90 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x3efa01a0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0x16c16967 -; CHECK-NEXT: v_mov_b32_e32 v7, 0xbf56c16c -; CHECK-NEXT: v_mov_b32_e32 v8, 0x69efb384 -; CHECK-NEXT: v_mov_b32_e32 v9, 0x3f4b2bb0 -; CHECK-NEXT: v_mov_b32_e32 v10, 0xa57d9582 -; CHECK-NEXT: v_mov_b32_e32 v11, 0xbf8c6ea4 -; CHECK-NEXT: v_mov_b32_e32 v12, 0xe82d3ff0 -; CHECK-NEXT: v_mov_b32_e32 v13, 0xbfa59976 -; CHECK-NEXT: v_mov_b32_e32 v14, 0x8427b883 -; CHECK-NEXT: v_mov_b32_e32 v15, 0x3fae1bb4 +; CHECK-NEXT: v_mov_b32_e32 v4, 0xa17f65f6 +; CHECK-NEXT: v_mov_b32_e32 v5, 0xbe927e4f +; CHECK-NEXT: v_mov_b32_e32 v6, 0x19f4ec90 +; CHECK-NEXT: v_mov_b32_e32 v7, 0x3efa01a0 +; CHECK-NEXT: v_mov_b32_e32 v8, 0x16c16967 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xbf56c16c +; CHECK-NEXT: v_mov_b32_e32 v10, 0x69efb384 +; CHECK-NEXT: v_mov_b32_e32 v11, 0x3f4b2bb0 +; CHECK-NEXT: v_mov_b32_e32 v12, 0xa57d9582 +; CHECK-NEXT: v_mov_b32_e32 v13, 0xbf8c6ea4 +; CHECK-NEXT: v_mov_b32_e32 v14, 0xe82d3ff0 ; CHECK-NEXT: s_mov_b64 s[22:23], 0 -; CHECK-NEXT: v_mov_b32_e32 v16, 0x57b87036 -; CHECK-NEXT: v_mov_b32_e32 v17, 0x3fb3b136 ; CHECK-NEXT: s_and_b64 s[4:5], exec, s[16:17] -; CHECK-NEXT: v_mov_b32_e32 v18, 0x55555523 -; CHECK-NEXT: v_mov_b32_e32 v19, 0xbfd55555 ; CHECK-NEXT: s_and_b64 s[6:7], exec, s[18:19] -; CHECK-NEXT: v_mov_b32_e32 v21, v20 +; CHECK-NEXT: v_mov_b32_e32 v15, 0xbfa59976 +; CHECK-NEXT: v_mov_b32_e32 v16, 0x8427b883 +; CHECK-NEXT: v_mov_b32_e32 v17, 0x3fae1bb4 +; CHECK-NEXT: v_mov_b32_e32 v18, 0x57b87036 +; CHECK-NEXT: v_mov_b32_e32 v19, 0x3fb3b136 +; CHECK-NEXT: v_mov_b32_e32 v20, 0x55555523 +; CHECK-NEXT: v_mov_b32_e32 v21, 0xbfd55555 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: ; implicit-def: $vgpr30_vgpr31 ; CHECK-NEXT: ; implicit-def: $vgpr22_vgpr23 ; CHECK-NEXT: s_branch .LBB0_2 @@ -62,28 +62,28 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15] ; CHECK-NEXT: flat_load_dwordx2 v[24:25], v[24:25] -; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[0:1] -; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[2:3] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[2:3] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[4:5] ; CHECK-NEXT: v_accvgpr_write_b32 a0, 0 ; CHECK-NEXT: v_accvgpr_write_b32 a1, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[24:25] ; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] -; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[4:5] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[6:7] ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] -; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[6:7] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[8:9] ; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] -; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[8:9] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[10:11] ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] -; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[10:11] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[12:13] ; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] -; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[12:13] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[14:15] ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] -; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[14:15] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[16:17] ; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] -; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[16:17] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[18:19] ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] -; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[18:19] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[20:21] ; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] ; CHECK-NEXT: s_branch .LBB0_6 ; CHECK-NEXT: .LBB0_5: ; %Flow @@ -103,21 +103,21 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: ; %bb.7: ; %.lr.ph2070.i.i.i3291 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v30 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v31 ; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19] +; CHECK-NEXT: v_accvgpr_write_b32 a1, v31 ; CHECK-NEXT: s_mov_b64 vcc, s[6:7] ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 ; CHECK-NEXT: v_accvgpr_write_b32 a0, v28 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v29 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v29 ; CHECK-NEXT: s_branch .LBB0_5 ; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[22:23], 0 -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11] ; CHECK-NEXT: s_mov_b64 s[8:9], s[20:21] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11] ; CHECK-NEXT: s_branch .LBB0_15 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 @@ -135,7 +135,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v27, v26 ; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17] -; CHECK-NEXT: global_store_dwordx2 v20, v[26:27], s[12:13] +; CHECK-NEXT: global_store_dwordx2 v0, v[26:27], s[12:13] ; CHECK-NEXT: s_cselect_b32 s23, s23, 0 ; CHECK-NEXT: s_cselect_b32 s22, s22, 0 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 @@ -154,7 +154,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: ; %bb.16: ; %._crit_edge2105.i.i.i2330 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 -; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[12:13] +; CHECK-NEXT: global_store_dwordx2 v0, v[0:1], s[12:13] ; CHECK-NEXT: s_branch .LBB0_1 ; CHECK-NEXT: .LBB0_17: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 28d7e6916e519..2f1b7e567fa06 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -824,36 +824,37 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 ; GCN-NEXT: buffer_store_dword v8, v21, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v8, vcc, 40, v0 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 28, v0 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 24, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 24, v0 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 20, v0 ; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v5, vcc, 12, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v6, vcc, 12, v0 +; GCN-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 8, v0 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v7, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -915,34 +916,34 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 44, v0 ; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 36, v0 ; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 28, v0 ; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 20, v0 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; GFX7-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v0 -; GFX7-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; GFX7-NEXT: buffer_store_dword v10, v6, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; GFX7-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 4, v0 +; GFX7-NEXT: buffer_store_dword v8, v5, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(9) -; GFX7-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v14, v19, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v11, v6, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v18, v10, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v16, v9, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1535,165 +1536,165 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_alignbit_b32 v22, v23, v22, 16 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16 -; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_alignbit_b32 v21, v21, v20, 16 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; GCN-NEXT: v_alignbit_b32 v20, v19, v18, 16 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v19, v2, v16, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v31 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16 -; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16 -; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16 -; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16 -; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GCN-NEXT: v_alignbit_b32 v1, v5, v23, 16 +; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; GCN-NEXT: v_alignbit_b32 v7, v7, v14, 16 +; GCN-NEXT: v_alignbit_b32 v6, v15, v32, 16 +; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16 +; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16 +; GCN-NEXT: v_alignbit_b32 v10, v18, v28, 16 +; GCN-NEXT: v_alignbit_b32 v9, v25, v26, 16 +; GCN-NEXT: v_alignbit_b32 v8, v27, v24, 16 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[12:13], s[4:7], 0 addr64 offset:32 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26 -; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v16 +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[12:13], s[4:7], 0 addr64 offset:16 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; GCN-NEXT: v_alignbit_b32 v11, v4, v17, 16 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[12:13], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[12:13], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_store_global_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_alignbit_b32 v27, v27, v26, 16 +; GFX7-NEXT: v_alignbit_b32 v26, v25, v24, 16 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v15 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v13 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 -; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_alignbit_b32 v12, v7, v12, 16 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30 +; GFX7-NEXT: v_alignbit_b32 v28, v29, v28, 16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; GFX7-NEXT: v_alignbit_b32 v11, v9, v14, 16 +; GFX7-NEXT: v_alignbit_b32 v9, v13, v15, 16 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v10, v10, v12, 16 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v28, v6, v7, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 -; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22 -; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v29, v0, v2, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; GFX7-NEXT: v_alignbit_b32 v8, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx4 v[26:29], v[24:25], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_store_dwordx4 v[12:15], v[24:25], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[24:25], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[24:25], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1763,30 +1764,24 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_alignbit_b32 v22, v23, v22, 16 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16 -; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_alignbit_b32 v21, v21, v20, 16 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; GCN-NEXT: v_alignbit_b32 v20, v19, v18, 16 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 @@ -1794,133 +1789,140 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16 -; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16 -; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16 -; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GCN-NEXT: v_alignbit_b32 v19, v4, v16, 16 +; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; GCN-NEXT: v_alignbit_b32 v13, v13, v12, 16 +; GCN-NEXT: v_alignbit_b32 v12, v11, v10, 16 +; GCN-NEXT: v_alignbit_b32 v11, v9, v8, 16 +; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v3, v5, v18, 16 +; GCN-NEXT: v_alignbit_b32 v2, v17, v2, 16 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v7, v5, v6, 16 +; GCN-NEXT: v_alignbit_b32 v6, v27, v26, 16 +; GCN-NEXT: v_alignbit_b32 v5, v25, v24, 16 ; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[9:10], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[9:10], s[4:7], 0 addr64 offset:16 ; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v15 ; GCN-NEXT: s_waitcnt vmcnt(13) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v16 ; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(10) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v23 +; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16 -; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16 -; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16 -; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_alignbit_b32 v14, v8, v11, 16 +; GCN-NEXT: v_alignbit_b32 v13, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v12, v17, v15, 16 +; GCN-NEXT: v_alignbit_b32 v11, v0, v16, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 ; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: s_waitcnt vmcnt(13) +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: s_waitcnt vmcnt(12) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: s_waitcnt vmcnt(11) +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: s_waitcnt vmcnt(9) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21 -; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16 -; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16 -; GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v17 +; GCN-NEXT: v_alignbit_b32 v18, v18, v21, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v24, 16 +; GCN-NEXT: v_alignbit_b32 v16, v22, v26, 16 +; GCN-NEXT: v_alignbit_b32 v15, v0, v8, 16 +; GCN-NEXT: v_alignbit_b32 v8, v25, v27, 16 +; GCN-NEXT: v_alignbit_b32 v22, v29, v28, 16 +; GCN-NEXT: v_alignbit_b32 v21, v30, v19, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 ; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 ; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 @@ -1928,48 +1930,48 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 ; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v23 +; GCN-NEXT: v_alignbit_b32 v20, v20, v0, 16 ; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16 -; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_alignbit_b32 v19, v0, v19, 16 +; GCN-NEXT: v_alignbit_b32 v26, v23, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v25, v27, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16 -; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112 -; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_alignbit_b32 v24, v24, v0, 16 +; GCN-NEXT: v_alignbit_b32 v23, v23, v28, 16 +; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[9:10], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[9:10], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[9:10], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[9:10], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[5:8], v[9:10], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[1:4], v[9:10], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -1988,41 +1990,72 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_alignbit_b32 v8, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 ; GFX7-NEXT: s_waitcnt vmcnt(7) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(6) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 ; GFX7-NEXT: v_alignbit_b32 v36, v31, v32, 16 +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v33 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v34 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_alignbit_b32 v35, v31, v32, 16 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v37 -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v38 ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16 ; GFX7-NEXT: v_alignbit_b32 v34, v31, v32, 16 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v39 @@ -2045,17 +2078,18 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_waitcnt vmcnt(5) ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39 ; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v39 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v48 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49 -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48 -; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 ; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 @@ -2072,20 +2106,23 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 ; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 ; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 +; GFX7-NEXT: s_waitcnt vmcnt(6) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(5) ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 -; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 ; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v48 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v49 +; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 -; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 -; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 ; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 @@ -2102,20 +2139,23 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 ; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 ; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 +; GFX7-NEXT: s_waitcnt vmcnt(6) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(5) ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 -; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 ; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v48 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v49 +; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 -; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 -; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 ; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 @@ -2123,61 +2163,29 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 ; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64 -; GFX7-NEXT: s_nop 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21 -; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16 -; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v38 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; GFX7-NEXT: v_alignbit_b32 v18, v0, v1, 16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v28 ; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16 -; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16 +; GFX7-NEXT: v_alignbit_b32 v16, v0, v1, 16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16 -; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[31:32], s[4:7], 0 addr64 +; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 +; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64 +; GFX7-NEXT: buffer_store_dwordx4 v[15:18], v[31:32], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_store_dwordx4 v[7:10], v[31:32], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[31:32], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[31:32], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -8824,34 +8832,34 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v9 -; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v10 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v13 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v13 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v14 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v15 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v16 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v17 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 @@ -9101,154 +9109,153 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xdc, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd8, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xd4, v0 -; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xd0, v0 +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xcc, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xc4, v0 +; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc0, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xbc, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xb8, v0 ; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb8, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xb4, v0 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb4, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v23, vcc, 0xb0, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xac, v0 +; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa8, v0 ; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xa4, v0 -; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xa0, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x9c, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xa0, v0 +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x9c, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x98, v0 +; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x98, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x94, v0 ; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x94, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x90, v0 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x90, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x8c, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x88, v0 ; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x88, v0 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x84, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x84, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x80, v0 -; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x7c, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x78, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x70, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x6c, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x64, v0 +; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x60, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x58, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0 +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x54, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x4c, v0 ; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x4c, v0 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x48, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 64, v0 +; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v29, vcc, 60, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v29, vcc, 44, v0 +; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 44, v0 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 40, v0 ; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 36, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 +; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 28, v0 +; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v23, vcc, 24, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen ; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 16, v0 ; GCN-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 16, v0 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 12, v0 ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 ; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0 -; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v32, vcc, 8, v0 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 4, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12 ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -9257,39 +9264,41 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5 ; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v6 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v8 +; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 ; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11 -; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen ; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v9 +; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v9 -; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12 -; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v36 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v12 +; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v37 ; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen ; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v13 -; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen +; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v14 +; GCN-NEXT: buffer_store_dword v3, v31, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14 -; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15 +; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v15 ; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 -; GCN-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v29, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -9300,15 +9309,15 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:62 -; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:60 -; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:58 -; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:56 -; GFX7-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:54 -; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52 -; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50 -; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:62 +; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:60 +; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:58 +; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:56 +; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:54 +; GFX7-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:52 +; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:50 +; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:32 ; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34 ; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36 ; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38 @@ -9317,215 +9326,214 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44 ; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46 ; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2 +; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:2 ; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4 ; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6 ; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:8 -; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10 +; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:10 ; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:12 ; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14 ; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:18 -; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:20 +; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:20 ; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:22 ; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24 -; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:26 +; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:26 ; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28 ; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30 ; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v16 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xfc, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0xfc, v0 +; GFX7-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v17 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xf4, v0 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xd8, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0xf4, v0 +; GFX7-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v18 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xec, v0 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0xd4, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0xec, v0 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xd8, v0 +; GFX7-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v19 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xe4, v0 -; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xd0, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v16 +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0xe4, v0 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0xd4, v0 +; GFX7-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v21 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v17 -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xdc, v0 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GFX7-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v23 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xdc, v0 +; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v22 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xd0, v0 ; GFX7-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v1, v20, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xcc, v0 -; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v23 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xc8, v0 -; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xc4, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xc8, v0 +; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0xc4, v0 +; GFX7-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v31 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xbc, v0 -; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v30 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xb8, v0 -; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xb4, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xb8, v0 +; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0xb4, v0 +; GFX7-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xac, v0 -; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v28 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xa8, v0 -; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xa4, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v27 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xa8, v0 +; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0xa4, v0 +; GFX7-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v27 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0 -; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v26 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x98, v0 -; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x94, v0 -; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x98, v0 +; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x94, v0 +; GFX7-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v25 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0 ; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0 -; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0 ; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x88, v0 +; GFX7-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen ; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x84, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x80, v0 ; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x80, v0 -; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v16 ; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v34 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; GFX7-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x7c, v0 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x74, v0 -; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0 -; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX7-NEXT: buffer_store_dword v2, v18, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x78, v0 +; GFX7-NEXT: buffer_store_dword v1, v14, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x74, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: buffer_store_dword v2, v14, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x70, v0 +; GFX7-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x6c, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v11 +; GFX7-NEXT: buffer_store_dword v2, v15, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x68, v0 +; GFX7-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x70, v0 -; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x6c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: buffer_store_dword v14, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x68, v0 -; GFX7-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x64, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v12, vcc, 0x60, v0 -; GFX7-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x64, v0 +; GFX7-NEXT: buffer_store_dword v2, v11, s[0:3], 0 offen ; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x5c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x58, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v7 -; GFX7-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 +; GFX7-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x5c, v0 +; GFX7-NEXT: buffer_store_dword v2, v9, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 +; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 -; GFX7-NEXT: buffer_store_dword v20, v7, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 -; GFX7-NEXT: buffer_store_dword v19, v7, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 -; GFX7-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: buffer_store_dword v2, v7, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 -; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v10 -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 ; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 64, v0 -; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX7-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; GFX7-NEXT: buffer_store_dword v19, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; GFX7-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; GFX7-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GFX7-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v0 -; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; GFX7-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; GFX7-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; GFX7-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; GFX7-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; GFX7-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 +; GFX7-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; GFX7-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; GFX7-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX7-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; GFX7-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; GFX7-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -9538,309 +9546,289 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 6, v1 ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 8, v1 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 8, v1 ; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 10, v1 ; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 14, v1 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 12, v1 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 14, v1 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, 16, v1 ; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 18, v1 +; GFX8-NEXT: flat_load_ushort v10, v[3:4] +; GFX8-NEXT: flat_load_ushort v9, v[5:6] +; GFX8-NEXT: flat_load_ushort v8, v[7:8] +; GFX8-NEXT: flat_load_ushort v7, v[11:12] +; GFX8-NEXT: flat_load_ushort v6, v[13:14] +; GFX8-NEXT: flat_load_ushort v5, v[15:16] +; GFX8-NEXT: flat_load_ushort v4, v[17:18] +; GFX8-NEXT: flat_load_ushort v3, v[19:20] +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 18, v1 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 20, v1 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 22, v1 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 24, v1 ; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, 20, v1 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 26, v1 +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, 28, v1 ; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1 +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 30, v1 ; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1 +; GFX8-NEXT: flat_load_ushort v11, v[11:12] +; GFX8-NEXT: flat_load_ushort v12, v[13:14] +; GFX8-NEXT: flat_load_ushort v13, v[15:16] +; GFX8-NEXT: flat_load_ushort v14, v[17:18] +; GFX8-NEXT: flat_load_ushort v15, v[19:20] +; GFX8-NEXT: flat_load_ushort v16, v[21:22] +; GFX8-NEXT: flat_load_ushort v17, v[23:24] +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 34, v1 +; GFX8-NEXT: v_addc_u32_e32 v19, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 36, v1 +; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v18, v[18:19] +; GFX8-NEXT: flat_load_ushort v19, v[20:21] +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 38, v1 +; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v22, vcc, 40, v1 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 42, v1 +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v26, vcc, 44, v1 +; GFX8-NEXT: v_addc_u32_e32 v27, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v28, vcc, 46, v1 +; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v30, vcc, 48, v1 +; GFX8-NEXT: v_addc_u32_e32 v31, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v20, v[20:21] +; GFX8-NEXT: flat_load_ushort v21, v[22:23] +; GFX8-NEXT: flat_load_ushort v22, v[24:25] +; GFX8-NEXT: flat_load_ushort v27, v[26:27] +; GFX8-NEXT: flat_load_ushort v28, v[28:29] +; GFX8-NEXT: flat_load_ushort v29, v[30:31] +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 50, v1 +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v25, vcc, 52, v1 ; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v27, vcc, 26, v1 -; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v29, vcc, 28, v1 -; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1 -; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v33, vcc, 34, v1 -; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1 -; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 -; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: flat_load_ushort v44, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1 -; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1 -; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v45, v[50:51] -; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1 -; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v46, v[50:51] -; GFX8-NEXT: v_add_u32_e32 v50, vcc, 42, v1 -; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1 -; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v47, v[52:53] -; GFX8-NEXT: v_add_u32_e32 v52, vcc, 44, v1 -; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1 -; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v56, v[54:55] -; GFX8-NEXT: v_add_u32_e32 v54, vcc, 46, v1 -; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1 -; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v57, v[39:40] -; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1 -; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v58, v[39:40] -; GFX8-NEXT: v_add_u32_e32 v40, vcc, 48, v1 -; GFX8-NEXT: v_addc_u32_e32 v41, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v42, vcc, 50, v1 -; GFX8-NEXT: v_addc_u32_e32 v43, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v42, v[42:43] -; GFX8-NEXT: flat_load_ushort v34, v[33:34] -; GFX8-NEXT: flat_load_ushort v36, v[35:36] -; GFX8-NEXT: flat_load_ushort v38, v[37:38] -; GFX8-NEXT: flat_load_ushort v39, v[48:49] -; GFX8-NEXT: flat_load_ushort v48, v[50:51] -; GFX8-NEXT: flat_load_ushort v51, v[52:53] -; GFX8-NEXT: flat_load_ushort v52, v[54:55] -; GFX8-NEXT: flat_load_ushort v53, v[40:41] -; GFX8-NEXT: v_add_u32_e32 v49, vcc, 32, v1 -; GFX8-NEXT: v_addc_u32_e32 v50, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v37, v[3:4] -; GFX8-NEXT: flat_load_ushort v35, v[5:6] -; GFX8-NEXT: flat_load_ushort v33, v[7:8] -; GFX8-NEXT: flat_load_ushort v8, v[9:10] -; GFX8-NEXT: flat_load_ushort v6, v[11:12] -; GFX8-NEXT: flat_load_ushort v4, v[13:14] -; GFX8-NEXT: flat_load_ushort v2, v[15:16] -; GFX8-NEXT: flat_load_ushort v1, v[19:20] -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 4, v0 -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x7c, v0 -; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v3 -; GFX8-NEXT: flat_load_ushort v3, v[17:18] -; GFX8-NEXT: flat_load_ushort v5, v[21:22] -; GFX8-NEXT: flat_load_ushort v7, v[23:24] -; GFX8-NEXT: flat_load_ushort v9, v[25:26] -; GFX8-NEXT: flat_load_ushort v10, v[27:28] -; GFX8-NEXT: flat_load_ushort v11, v[29:30] -; GFX8-NEXT: flat_load_ushort v12, v[31:32] -; GFX8-NEXT: flat_load_ushort v13, v[49:50] -; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0 -; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfc, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v45 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v46 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf8, v0 -; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf4, v0 -; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v47 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xf0, v0 -; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xec, v0 -; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xe8, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v56 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe4, v0 -; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe0, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v57 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xdc, v0 -; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v58 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd8, v0 -; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xd4, v0 -; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v42 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd0, v0 -; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xcc, v0 -; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GFX8-NEXT: flat_load_ushort v30, v[23:24] +; GFX8-NEXT: flat_load_ushort v31, v[25:26] +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 54, v1 +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v25, vcc, 56, v1 +; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v32, v[23:24] +; GFX8-NEXT: flat_load_ushort v25, v[25:26] +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 58, v1 +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v26, v[23:24] +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 60, v1 +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v33, v[23:24] +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 62, v1 +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v23, v[23:24] +; GFX8-NEXT: flat_load_ushort v24, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v34, v[1:2] ; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc8, v0 -; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc4, v0 -; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v52 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xc0, v0 -; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xbc, v0 -; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0 -; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0 -; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb0, v0 -; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xac, v0 -; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa8, v0 -; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xa4, v0 -; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v38 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa0, v0 -; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x9c, v0 -; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x98, v0 -; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x94, v0 -; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x90, v0 -; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x8c, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37 -; GFX8-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x88, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v16 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v35 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x80, v0 -; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x78, v0 -; GFX8-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v18 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v11 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x74, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x70, v0 -; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v8 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x6c, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x68, v0 -; GFX8-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x60, v0 -; GFX8-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0 -; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x58, v0 -; GFX8-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX8-NEXT: s_waitcnt vmcnt(13) +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v24 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0 -; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 4, v0 +; GFX8-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xfc, v0 +; GFX8-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xf8, v0 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 -; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xf4, v0 +; GFX8-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xf0, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xec, v0 +; GFX8-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xe8, v0 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe4, v0 +; GFX8-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xe0, v0 +; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xdc, v0 +; GFX8-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v31 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xd8, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xd4, v0 +; GFX8-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xd0, v0 +; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xcc, v0 +; GFX8-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v29 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc8, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xc4, v0 +; GFX8-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xc0, v0 +; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xbc, v0 +; GFX8-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v27 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xb8, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xb4, v0 +; GFX8-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xb0, v0 +; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xac, v0 +; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xa8, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xa4, v0 +; GFX8-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xa0, v0 +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0x9c, v0 +; GFX8-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x98, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x94, v0 +; GFX8-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x90, v0 +; GFX8-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x8c, v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[20:21], v8 +; GFX8-NEXT: buffer_store_dword v2, v18, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v10 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x88, v0 +; GFX8-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x84, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x78, v0 +; GFX8-NEXT: buffer_store_dword v2, v10, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x80, v0 +; GFX8-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x7c, v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; GFX8-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; GFX8-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x74, v0 +; GFX8-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x70, v0 +; GFX8-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x6c, v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; GFX8-NEXT: buffer_store_dword v2, v15, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x68, v0 +; GFX8-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0 +; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x60, v0 +; GFX8-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x5c, v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; GFX8-NEXT: buffer_store_dword v2, v13, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[13:14], v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x54, v0 +; GFX8-NEXT: buffer_store_dword v2, v11, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x50, v0 +; GFX8-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[11:12], v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x4c, v0 +; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x48, v0 +; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0 -; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v0 -; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 60, v0 -; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 -; GFX8-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 52, v0 -; GFX8-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v0 -; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 44, v0 -; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 40, v0 -; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 36, v0 -; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 -; GFX8-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 -; GFX8-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0 -; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0 -; GFX8-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0 -; GFX8-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 -; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -9893,142 +9881,144 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v12 ; GFX900-NEXT: s_waitcnt vmcnt(29) ; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 ; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:244 ; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:240 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 ; GFX900-NEXT: s_waitcnt vmcnt(30) -; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX900-NEXT: s_waitcnt vmcnt(29) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX900-NEXT: s_waitcnt vmcnt(28) +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:236 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 ; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:232 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v12 -; GFX900-NEXT: s_waitcnt vmcnt(31) -; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; GFX900-NEXT: s_waitcnt vmcnt(30) -; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:228 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX900-NEXT: s_waitcnt vmcnt(29) +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:224 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v16 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[16:17], v17 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:228 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:216 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v18 ; GFX900-NEXT: s_waitcnt vmcnt(31) -; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v14 -; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 -; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v15 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 -; GFX900-NEXT: s_waitcnt vmcnt(32) -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX900-NEXT: s_waitcnt vmcnt(30) -; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v20 ; GFX900-NEXT: s_waitcnt vmcnt(28) -; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 -; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:200 -; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:196 -; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:192 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v21 -; GFX900-NEXT: s_waitcnt vmcnt(33) +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; GFX900-NEXT: s_waitcnt vmcnt(27) +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:220 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[18:19], v19 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:208 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v2 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:204 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:200 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v22 +; GFX900-NEXT: s_waitcnt vmcnt(30) ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v19 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v20 -; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:188 -; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:184 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:180 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:176 -; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:172 -; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:168 -; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:164 -; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:160 -; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156 -; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152 -; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:212 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:196 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[22:23], v23 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:192 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:188 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:184 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:180 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:176 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:172 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:168 +; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:164 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:160 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:156 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:152 +; GFX900-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:148 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:144 -; GFX900-NEXT: s_waitcnt vmcnt(44) +; GFX900-NEXT: s_waitcnt vmcnt(43) ; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v25 +; GFX900-NEXT: s_waitcnt vmcnt(42) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[12:13], v2 ; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 ; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 -; GFX900-NEXT: s_waitcnt vmcnt(38) -; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 -; GFX900-NEXT: s_waitcnt vmcnt(38) -; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v15 -; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v17 -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 +; GFX900-NEXT: s_waitcnt vmcnt(37) +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[14:15], v9 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:132 +; GFX900-NEXT: s_waitcnt vmcnt(36) +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v30 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v2 -; GFX900-NEXT: s_waitcnt vmcnt(41) -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:124 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v29 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v11 ; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v2 -; GFX900-NEXT: s_waitcnt vmcnt(40) -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:108 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:128 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: s_waitcnt vmcnt(37) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:108 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[20:21], v8 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v2 -; GFX900-NEXT: s_waitcnt vmcnt(41) +; GFX900-NEXT: s_waitcnt vmcnt(37) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:120 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:112 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[14:15], v2 +; GFX900-NEXT: s_waitcnt vmcnt(38) ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v2 -; GFX900-NEXT: s_waitcnt vmcnt(40) +; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:144 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[22:23], v2 +; GFX900-NEXT: s_waitcnt vmcnt(38) ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 -; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 -; GFX900-NEXT: s_waitcnt vmcnt(41) -; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX900-NEXT: s_waitcnt vmcnt(40) -; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:104 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[24:25], v6 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:96 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v2 +; GFX900-NEXT: s_waitcnt vmcnt(39) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; GFX900-NEXT: s_waitcnt vmcnt(38) +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v1 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 -; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[7:8], v2 ; GFX900-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 ; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84 ; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[5:6], v11 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:116 +; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:100 +; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 +; GFX900-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 ; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 ; GFX900-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX900-NEXT: v_cvt_f64_f32_e32 v[1:2], v10 -; GFX900-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 -; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64 -; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60 -; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56 -; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 -; GFX900-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:44 -; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40 -; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:36 -; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:32 -; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 -; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 -; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 -; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 -; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 -; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 -; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:4 -; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:68 +; GFX900-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:64 +; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:60 +; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:56 +; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:52 +; GFX900-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:48 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:44 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:40 +; GFX900-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:36 +; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:32 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:24 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:20 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:16 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:4 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10043,8 +10033,8 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse -; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse ; GFX950-NEXT: global_load_ushort v1, v[2:3], off offset:2 ; GFX950-NEXT: global_load_ushort v4, v[2:3], off offset:12 ; GFX950-NEXT: global_load_ushort v5, v[2:3], off offset:8 @@ -10070,18 +10060,21 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX950-NEXT: global_load_ushort v25, v[2:3], off offset:42 ; GFX950-NEXT: global_load_ushort v26, v[2:3], off offset:46 ; GFX950-NEXT: global_load_ushort v42, v[2:3], off offset:50 -; GFX950-NEXT: global_load_ushort v43, v[2:3], off offset:62 -; GFX950-NEXT: global_load_ushort v46, v[2:3], off offset:60 -; GFX950-NEXT: global_load_ushort v47, v[2:3], off offset:56 -; GFX950-NEXT: global_load_ushort v60, v[2:3], off offset:52 +; GFX950-NEXT: global_load_ushort v43, v[2:3], off offset:60 +; GFX950-NEXT: global_load_ushort v46, v[2:3], off offset:56 +; GFX950-NEXT: global_load_ushort v47, v[2:3], off offset:52 ; GFX950-NEXT: global_load_ushort v56, v[2:3], off offset:48 -; GFX950-NEXT: global_load_ushort v57, v[2:3], off offset:54 -; GFX950-NEXT: global_load_ushort v58, v[2:3], off offset:58 +; GFX950-NEXT: global_load_ushort v57, v[2:3], off offset:62 +; GFX950-NEXT: global_load_ushort v60, v[2:3], off offset:54 +; GFX950-NEXT: global_load_ushort v61, v[2:3], off offset:58 ; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse ; GFX950-NEXT: s_waitcnt vmcnt(31) ; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX950-NEXT: s_waitcnt vmcnt(30) @@ -10128,37 +10121,34 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v26 ; GFX950-NEXT: s_waitcnt vmcnt(7) ; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v42 -; GFX950-NEXT: s_waitcnt vmcnt(6) -; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v43 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[18:19], v32 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[24:25], v33 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[26:27], v36 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[32:33], v37 +; GFX950-NEXT: s_waitcnt vmcnt(2) +; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v57 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[30:31], v38 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[36:37], v39 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[38:39], v44 -; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v42 -; GFX950-NEXT: s_waitcnt vmcnt(5) -; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v46 -; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v42 -; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v58 -; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:240 -; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v46 -; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v47 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v1 ; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v42 +; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v43 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v42 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v1 -; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v46 +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v60 ; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v17 ; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v23 ; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v22 -; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:224 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:240 +; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v46 ; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX950-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v1 -; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_lshlrev_b32_e32 v47, 16, v61 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[16:17], v29 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[22:23], v34 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[28:29], v35 @@ -10168,10 +10158,13 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX950-NEXT: v_cvt_f64_f32_e32 v[54:55], v53 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[52:53], v40 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[40:41], v41 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[62:63], v47 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[60:61], v46 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v1 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[8:9], v7 ; GFX950-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[60:63], off offset:224 ; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:208 ; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:192 ; GFX950-NEXT: scratch_store_dwordx4 v0, v[38:41], off offset:176 @@ -10186,6 +10179,9 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX950-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32 ; GFX950-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16 ; GFX950-NEXT: scratch_store_dwordx4 v0, v[2:5], off +; GFX950-NEXT: v_accvgpr_read_b32 v63, a15 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse ; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse @@ -10278,111 +10274,112 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v22 ; GFX10-NEXT: s_waitcnt vmcnt(11) ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX10-NEXT: s_waitcnt vmcnt(10) +; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v24 ; GFX10-NEXT: s_waitcnt vmcnt(9) ; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v25 ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v26 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v27 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 ; GFX10-NEXT: s_waitcnt vmcnt(5) ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v29 ; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v30 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v31 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v33 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v34 -; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v31 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v28 -; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v24 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v25 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v27 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v23 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v71 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v68 -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v16 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v70 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v32 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v81 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v80 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v71 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v69 ; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v15 -; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 -; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v16 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v37 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v38 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v67 +; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 +; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v48 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:240 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v25 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v66 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:196 +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:204 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:212 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:208 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v70 +; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:200 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v68 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v66 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v49 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v65 +; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:188 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:184 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v64 ; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236 ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:232 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v27 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v48 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v50 ; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228 ; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:224 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v81 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v49 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v51 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v55 +; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:180 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:176 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v54 ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v80 -; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:212 -; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:208 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v69 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v64 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v50 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v51 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v54 -; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204 -; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v67 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v39 -; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:196 -; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:192 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v65 -; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188 -; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v55 -; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:180 -; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:176 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v53 -; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:172 -; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:168 ; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v52 -; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:164 -; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:160 -; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:156 -; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:152 -; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148 -; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144 -; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:140 -; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:136 -; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:132 -; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:128 -; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124 -; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120 -; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:116 -; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112 -; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108 -; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104 -; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 -; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 -; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:92 -; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:88 -; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:84 -; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[52:53], v53 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v39 +; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:172 +; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:168 +; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:164 +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:160 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152 +; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:148 +; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:144 +; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:140 +; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:136 +; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:132 +; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:128 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:124 +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:120 +; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:116 +; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:112 +; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:108 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 +; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:100 +; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:96 +; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:92 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:88 +; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:84 +; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:80 ; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:76 ; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:72 -; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:68 -; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:64 -; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:60 -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:56 -; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:52 -; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:48 -; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44 -; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:68 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:64 +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:60 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:56 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:52 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:48 +; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:44 +; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 ; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36 ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32 ; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 @@ -10561,112 +10558,112 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX1250-NEXT: global_load_u16 v9, v[2:3], off offset:10 ; GFX1250-NEXT: global_load_u16 v10, v[2:3], off offset:14 ; GFX1250-NEXT: global_load_u16 v11, v[2:3], off offset:18 -; GFX1250-NEXT: global_load_u16 v12, v[2:3], off offset:62 -; GFX1250-NEXT: global_load_u16 v13, v[2:3], off offset:60 -; GFX1250-NEXT: global_load_u16 v14, v[2:3], off offset:58 -; GFX1250-NEXT: global_load_u16 v15, v[2:3], off offset:56 -; GFX1250-NEXT: global_load_u16 v16, v[2:3], off offset:28 -; GFX1250-NEXT: global_load_u16 v17, v[2:3], off offset:24 -; GFX1250-NEXT: global_load_u16 v18, v[2:3], off offset:20 -; GFX1250-NEXT: global_load_u16 v19, v[2:3], off offset:16 -; GFX1250-NEXT: global_load_u16 v20, v[2:3], off offset:22 -; GFX1250-NEXT: global_load_u16 v21, v[2:3], off offset:26 -; GFX1250-NEXT: global_load_u16 v22, v[2:3], off offset:30 -; GFX1250-NEXT: global_load_u16 v23, v[2:3], off offset:34 -; GFX1250-NEXT: global_load_u16 v24, v[2:3], off offset:44 -; GFX1250-NEXT: global_load_u16 v25, v[2:3], off offset:40 -; GFX1250-NEXT: global_load_u16 v26, v[2:3], off offset:36 -; GFX1250-NEXT: global_load_u16 v27, v[2:3], off offset:32 -; GFX1250-NEXT: global_load_u16 v28, v[2:3], off offset:38 -; GFX1250-NEXT: global_load_u16 v29, v[2:3], off offset:42 +; GFX1250-NEXT: global_load_u16 v12, v[2:3], off offset:28 +; GFX1250-NEXT: global_load_u16 v13, v[2:3], off offset:24 +; GFX1250-NEXT: global_load_u16 v14, v[2:3], off offset:20 +; GFX1250-NEXT: global_load_u16 v15, v[2:3], off offset:16 +; GFX1250-NEXT: global_load_u16 v16, v[2:3], off offset:22 +; GFX1250-NEXT: global_load_u16 v17, v[2:3], off offset:26 +; GFX1250-NEXT: global_load_u16 v18, v[2:3], off offset:30 +; GFX1250-NEXT: global_load_u16 v19, v[2:3], off offset:34 +; GFX1250-NEXT: global_load_u16 v20, v[2:3], off offset:44 +; GFX1250-NEXT: global_load_u16 v21, v[2:3], off offset:40 +; GFX1250-NEXT: global_load_u16 v22, v[2:3], off offset:36 +; GFX1250-NEXT: global_load_u16 v23, v[2:3], off offset:32 +; GFX1250-NEXT: global_load_u16 v24, v[2:3], off offset:38 +; GFX1250-NEXT: global_load_u16 v25, v[2:3], off offset:42 +; GFX1250-NEXT: global_load_u16 v26, v[2:3], off offset:62 +; GFX1250-NEXT: global_load_u16 v27, v[2:3], off offset:60 +; GFX1250-NEXT: global_load_u16 v28, v[2:3], off offset:58 +; GFX1250-NEXT: global_load_u16 v29, v[2:3], off offset:56 ; GFX1250-NEXT: global_load_u16 v30, v[2:3], off offset:46 ; GFX1250-NEXT: global_load_u16 v31, v[2:3], off offset:50 ; GFX1250-NEXT: global_load_u16 v32, v[2:3], off offset:52 ; GFX1250-NEXT: global_load_u16 v33, v[2:3], off offset:48 ; GFX1250-NEXT: global_load_u16 v34, v[2:3], off offset:54 ; GFX1250-NEXT: s_wait_loadcnt 0x1e -; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v37, 16, v4 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v98, 16, v4 ; GFX1250-NEXT: s_wait_loadcnt 0x1c -; GFX1250-NEXT: v_dual_lshlrev_b32 v81, 16, v5 :: v_dual_lshlrev_b32 v85, 16, v6 +; GFX1250-NEXT: v_dual_lshlrev_b32 v97, 16, v5 :: v_dual_lshlrev_b32 v36, 16, v6 ; GFX1250-NEXT: s_wait_loadcnt 0x1a -; GFX1250-NEXT: v_dual_lshlrev_b32 v84, 16, v7 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX1250-NEXT: v_dual_lshlrev_b32 v96, 16, v7 :: v_dual_lshlrev_b32 v35, 16, v8 ; GFX1250-NEXT: s_wait_loadcnt 0x18 -; GFX1250-NEXT: v_dual_lshlrev_b32 v80, 16, v9 :: v_dual_lshlrev_b32 v36, 16, v10 -; GFX1250-NEXT: s_wait_loadcnt 0x15 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v12 :: v_dual_lshlrev_b32 v3, 16, v13 +; GFX1250-NEXT: v_dual_lshlrev_b32 v37, 16, v9 :: v_dual_lshlrev_b32 v38, 16, v10 +; GFX1250-NEXT: s_wait_loadcnt 0x16 +; GFX1250-NEXT: v_dual_lshlrev_b32 v39, 16, v11 :: v_dual_lshlrev_b32 v101, 16, v12 ; GFX1250-NEXT: s_wait_loadcnt 0x14 -; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v11 :: v_dual_lshlrev_b32 v6, 16, v14 -; GFX1250-NEXT: s_wait_loadcnt 0x13 -; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; GFX1250-NEXT: v_dual_lshlrev_b32 v48, 16, v13 :: v_dual_lshlrev_b32 v100, 16, v14 +; GFX1250-NEXT: s_wait_loadcnt 0x12 +; GFX1250-NEXT: v_dual_lshlrev_b32 v99, 16, v15 :: v_dual_lshlrev_b32 v16, 16, v16 +; GFX1250-NEXT: s_wait_loadcnt 0x10 +; GFX1250-NEXT: v_dual_lshlrev_b32 v17, 16, v17 :: v_dual_lshlrev_b32 v49, 16, v18 +; GFX1250-NEXT: s_wait_loadcnt 0xf +; GFX1250-NEXT: v_lshlrev_b32_e32 v50, 16, v19 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[12:13], v36 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[18:19], v37 +; GFX1250-NEXT: s_wait_loadcnt 0xb +; GFX1250-NEXT: v_dual_lshlrev_b32 v52, 16, v22 :: v_dual_lshlrev_b32 v102, 16, v23 +; GFX1250-NEXT: s_wait_loadcnt 0x9 +; GFX1250-NEXT: v_dual_lshlrev_b32 v24, 16, v24 :: v_dual_lshlrev_b32 v25, 16, v25 +; GFX1250-NEXT: s_wait_loadcnt 0x7 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v26 :: v_dual_lshlrev_b32 v3, 16, v27 +; GFX1250-NEXT: s_wait_loadcnt 0x6 +; GFX1250-NEXT: v_dual_lshlrev_b32 v21, 16, v21 :: v_dual_lshlrev_b32 v6, 16, v28 +; GFX1250-NEXT: s_wait_loadcnt 0x5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v29 ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v2 ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX1250-NEXT: s_wait_loadcnt 0x11 -; GFX1250-NEXT: v_dual_lshlrev_b32 v68, 16, v17 :: v_dual_lshlrev_b32 v39, 16, v16 -; GFX1250-NEXT: s_wait_loadcnt 0xe -; GFX1250-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v65, 16, v32 :: v_dual_lshlrev_b32 v64, 16, v34 ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; GFX1250-NEXT: s_wait_loadcnt 0xc -; GFX1250-NEXT: v_dual_lshlrev_b32 v21, 16, v21 :: v_dual_lshlrev_b32 v38, 16, v22 -; GFX1250-NEXT: s_wait_loadcnt 0x9 -; GFX1250-NEXT: v_dual_lshlrev_b32 v48, 16, v23 :: v_dual_lshlrev_b32 v25, 16, v25 -; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX1250-NEXT: s_wait_loadcnt 0x5 -; GFX1250-NEXT: v_dual_lshlrev_b32 v49, 16, v28 :: v_dual_lshlrev_b32 v64, 16, v29 -; GFX1250-NEXT: s_wait_loadcnt 0x3 -; GFX1250-NEXT: v_dual_lshlrev_b32 v50, 16, v30 :: v_dual_lshlrev_b32 v51, 16, v31 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v33, 16, v33 :: v_dual_lshlrev_b32 v52, 16, v34 -; GFX1250-NEXT: v_dual_lshlrev_b32 v32, 16, v32 :: v_dual_lshlrev_b32 v69, 16, v27 -; GFX1250-NEXT: v_lshlrev_b32_e32 v70, 16, v26 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[14:15], v35 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[54:55], v52 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[52:53], v32 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[30:31], v38 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[28:29], v39 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[34:35], v48 +; GFX1250-NEXT: v_dual_lshlrev_b32 v29, 16, v31 :: v_dual_lshlrev_b32 v53, 16, v33 +; GFX1250-NEXT: v_dual_lshlrev_b32 v28, 16, v30 :: v_dual_lshlrev_b32 v20, 16, v20 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[86:87], v64 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[84:85], v65 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[82:83], v29 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[80:81], v53 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[70:71], v28 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[68:69], v20 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[66:67], v25 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[64:65], v21 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[54:55], v24 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[52:53], v52 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[22:23], v38 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[26:27], v39 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[32:33], v48 ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[38:39], v49 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[48:49], v33 -; GFX1250-NEXT: v_dual_lshlrev_b32 v13, 16, v19 :: v_dual_lshlrev_b32 v82, 16, v18 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[66:67], v64 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[64:65], v25 -; GFX1250-NEXT: scratch_store_b128 v0, v[2:5], off offset:240 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v50 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[50:51], v51 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v24 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[18:19], v36 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[16:17], v37 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[36:37], v70 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[32:33], v69 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[70:71], v21 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[68:69], v68 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[26:27], v20 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[24:25], v82 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[22:23], v12 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[20:21], v13 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[82:83], v80 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[80:81], v81 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[12:13], v85 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[50:51], v50 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[48:49], v102 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[36:37], v101 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[14:15], v35 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[34:35], v17 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[30:31], v16 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[28:29], v100 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[24:25], v99 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[20:21], v98 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[16:17], v97 ; GFX1250-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: scratch_store_b128 v0, v[2:5], off offset:240 ; GFX1250-NEXT: scratch_store_b128 v0, v[6:9], off offset:224 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v84 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v96 ; GFX1250-NEXT: s_clause 0xd -; GFX1250-NEXT: scratch_store_b128 v0, v[52:55], off offset:208 -; GFX1250-NEXT: scratch_store_b128 v0, v[48:51], off offset:192 -; GFX1250-NEXT: scratch_store_b128 v0, v[2:5], off offset:176 +; GFX1250-NEXT: scratch_store_b128 v0, v[84:87], off offset:208 +; GFX1250-NEXT: scratch_store_b128 v0, v[80:83], off offset:192 +; GFX1250-NEXT: scratch_store_b128 v0, v[68:71], off offset:176 ; GFX1250-NEXT: scratch_store_b128 v0, v[64:67], off offset:160 -; GFX1250-NEXT: scratch_store_b128 v0, v[36:39], off offset:144 -; GFX1250-NEXT: scratch_store_b128 v0, v[32:35], off offset:128 -; GFX1250-NEXT: scratch_store_b128 v0, v[28:31], off offset:112 -; GFX1250-NEXT: scratch_store_b128 v0, v[68:71], off offset:96 -; GFX1250-NEXT: scratch_store_b128 v0, v[24:27], off offset:80 -; GFX1250-NEXT: scratch_store_b128 v0, v[20:23], off offset:64 -; GFX1250-NEXT: scratch_store_b128 v0, v[16:19], off offset:48 -; GFX1250-NEXT: scratch_store_b128 v0, v[80:83], off offset:32 +; GFX1250-NEXT: scratch_store_b128 v0, v[52:55], off offset:144 +; GFX1250-NEXT: scratch_store_b128 v0, v[48:51], off offset:128 +; GFX1250-NEXT: scratch_store_b128 v0, v[36:39], off offset:112 +; GFX1250-NEXT: scratch_store_b128 v0, v[32:35], off offset:96 +; GFX1250-NEXT: scratch_store_b128 v0, v[28:31], off offset:80 +; GFX1250-NEXT: scratch_store_b128 v0, v[24:27], off offset:64 +; GFX1250-NEXT: scratch_store_b128 v0, v[20:23], off offset:48 +; GFX1250-NEXT: scratch_store_b128 v0, v[16:19], off offset:32 ; GFX1250-NEXT: scratch_store_b128 v0, v[12:15], off offset:16 ; GFX1250-NEXT: scratch_store_b128 v0, v[8:11], off ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index ab2ad19d0f1bf..42d2043740bdd 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -4,16 +4,20 @@ define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { ; CHECK-LABEL: spill: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s27, s[8:9], 0x2 ; CHECK-NEXT: s_mov_b64 s[98:99], s[2:3] ; CHECK-NEXT: s_mov_b64 s[96:97], s[0:1] +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x2 +; CHECK-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; CHECK-NEXT: s_add_u32 s96, s96, s15 ; CHECK-NEXT: s_addc_u32 s97, s97, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_eq_u32 s27, 0 +; CHECK-NEXT: v_writelane_b32 v0, s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 1 +; CHECK-NEXT: v_readlane_b32 s0, v0, 0 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: ;;#ASMEND @@ -326,9 +330,9 @@ define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { ; CHECK-NEXT: s_cbranch_scc0 .LBB0_1 ; CHECK-NEXT: ; %bb.3: ; %entry ; CHECK-NEXT: s_not_b64 exec, exec -; CHECK-NEXT: buffer_store_dword v0, off, s[96:99], 0 -; CHECK-NEXT: v_writelane_b32 v0, s0, 0 -; CHECK-NEXT: v_writelane_b32 v0, s1, 1 +; CHECK-NEXT: buffer_store_dword v1, off, s[96:99], 0 +; CHECK-NEXT: v_writelane_b32 v1, s0, 0 +; CHECK-NEXT: v_writelane_b32 v1, s1, 1 ; CHECK-NEXT: s_getpc_b64 s[0:1] ; CHECK-NEXT: .Lpost_getpc0: ; CHECK-NEXT: s_add_u32 s0, s0, (.LBB0_4-.Lpost_getpc0)&4294967295 @@ -347,11 +351,12 @@ define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_4: ; %bb3 -; CHECK-NEXT: v_readlane_b32 s0, v0, 0 -; CHECK-NEXT: v_readlane_b32 s1, v0, 1 -; CHECK-NEXT: buffer_load_dword v0, off, s[96:99], 0 +; CHECK-NEXT: v_readlane_b32 s0, v1, 0 +; CHECK-NEXT: v_readlane_b32 s1, v1, 1 +; CHECK-NEXT: buffer_load_dword v1, off, s[96:99], 0 ; CHECK-NEXT: s_not_b64 exec, exec ; CHECK-NEXT: .LBB0_2: ; %bb3 +; CHECK-NEXT: v_readlane_b32 s0, v0, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use s0 ; CHECK-NEXT: ;;#ASMEND @@ -900,8 +905,11 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_waitcnt expcnt(0) +; CHECK-NEXT: s_waitcnt expcnt(1) +; CHECK-NEXT: v_writelane_b32 v0, s100, 39 +; CHECK-NEXT: v_writelane_b32 v0, s101, 40 ; CHECK-NEXT: v_writelane_b32 v0, s30, 0 ; CHECK-NEXT: v_writelane_b32 v0, s31, 1 ; CHECK-NEXT: v_writelane_b32 v0, s33, 2 @@ -936,13 +944,11 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v0, s86, 31 ; CHECK-NEXT: v_writelane_b32 v0, s87, 32 ; CHECK-NEXT: v_writelane_b32 v0, s96, 33 +; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v0, s97, 34 +; CHECK-NEXT: s_waitcnt expcnt(0) +; CHECK-NEXT: v_writelane_b32 v1, s12, 0 ; CHECK-NEXT: v_writelane_b32 v0, s98, 35 -; CHECK-NEXT: v_writelane_b32 v0, s99, 36 -; CHECK-NEXT: s_mov_b32 s40, s12 -; CHECK-NEXT: v_writelane_b32 v0, s100, 37 -; CHECK-NEXT: s_cmp_eq_u32 s40, 0 -; CHECK-NEXT: v_writelane_b32 v0, s101, 38 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: ;;#ASMEND @@ -958,6 +964,12 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v1, s4, 1 +; CHECK-NEXT: v_writelane_b32 v0, s99, 36 +; CHECK-NEXT: v_readlane_b32 s4, v1, 0 +; CHECK-NEXT: v_writelane_b32 v0, s100, 37 +; CHECK-NEXT: s_cmp_eq_u32 s4, 0 +; CHECK-NEXT: v_writelane_b32 v0, s101, 38 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: ;;#ASMEND @@ -1258,9 +1270,9 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: s_cbranch_scc0 .LBB1_1 ; CHECK-NEXT: ; %bb.3: ; %entry ; CHECK-NEXT: s_not_b64 exec, exec -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; CHECK-NEXT: v_writelane_b32 v1, s0, 0 -; CHECK-NEXT: v_writelane_b32 v1, s1, 1 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; CHECK-NEXT: v_writelane_b32 v2, s0, 0 +; CHECK-NEXT: v_writelane_b32 v2, s1, 1 ; CHECK-NEXT: s_getpc_b64 s[0:1] ; CHECK-NEXT: .Lpost_getpc1: ; CHECK-NEXT: s_add_u32 s0, s0, (.LBB1_4-.Lpost_getpc1)&4294967295 @@ -1279,9 +1291,9 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_branch .LBB1_2 ; CHECK-NEXT: .LBB1_4: ; %bb3 -; CHECK-NEXT: v_readlane_b32 s0, v1, 0 -; CHECK-NEXT: v_readlane_b32 s1, v1, 1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; CHECK-NEXT: v_readlane_b32 s0, v2, 0 +; CHECK-NEXT: v_readlane_b32 s1, v2, 1 +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; CHECK-NEXT: s_not_b64 exec, exec ; CHECK-NEXT: .LBB1_2: ; %bb3 ; CHECK-NEXT: ;;#ASMSTART @@ -1296,6 +1308,7 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use s3 ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s4, v1, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use s4 ; CHECK-NEXT: ;;#ASMEND @@ -1590,14 +1603,14 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use s101 ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s101, v0, 38 +; CHECK-NEXT: v_readlane_b32 s100, v0, 37 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use vcc_lo ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use vcc_hi ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s101, v0, 38 -; CHECK-NEXT: v_readlane_b32 s100, v0, 37 ; CHECK-NEXT: v_readlane_b32 s99, v0, 36 ; CHECK-NEXT: v_readlane_b32 s98, v0, 35 ; CHECK-NEXT: v_readlane_b32 s97, v0, 34 @@ -1635,8 +1648,11 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s33, v0, 2 ; CHECK-NEXT: v_readlane_b32 s31, v0, 1 ; CHECK-NEXT: v_readlane_b32 s30, v0, 0 +; CHECK-NEXT: v_readlane_b32 s100, v0, 39 +; CHECK-NEXT: v_readlane_b32 s101, v0, 40 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 361bc78759bfa..d3c3db47782b2 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -2606,42 +2606,46 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) { ; SDAG-LABEL: load_v32i8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: buffer_load_dwordx4 v[36:39], off, s[16:19], 0 -; SDAG-NEXT: buffer_load_dwordx4 v[32:35], off, s[16:19], 0 offset:16 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[16:19], 0 offset:16 ; SDAG-NEXT: s_waitcnt vmcnt(1) -; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[36:37] -; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[38:39] -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; SDAG-NEXT: v_lshrrev_b64 v[27:28], 24, v[34:35] -; SDAG-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v37 -; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v37 -; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v37 -; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v38 -; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v38 -; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v39 -; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v39 -; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v39 -; SDAG-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; SDAG-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; SDAG-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; SDAG-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; SDAG-NEXT: v_lshrrev_b32_e32 v25, 8, v34 -; SDAG-NEXT: v_lshrrev_b32_e32 v26, 16, v34 -; SDAG-NEXT: v_lshrrev_b32_e32 v29, 8, v35 -; SDAG-NEXT: v_lshrrev_b32_e32 v30, 16, v35 -; SDAG-NEXT: v_lshrrev_b32_e32 v31, 24, v35 -; SDAG-NEXT: v_mov_b32_e32 v0, v36 -; SDAG-NEXT: v_mov_b32_e32 v4, v37 -; SDAG-NEXT: v_mov_b32_e32 v8, v38 -; SDAG-NEXT: v_mov_b32_e32 v12, v39 -; SDAG-NEXT: v_mov_b32_e32 v16, v32 -; SDAG-NEXT: v_mov_b32_e32 v20, v33 -; SDAG-NEXT: v_mov_b32_e32 v24, v34 -; SDAG-NEXT: v_mov_b32_e32 v28, v35 +; SDAG-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_lshrrev_b64 v[37:38], 24, v[16:17] +; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] +; SDAG-NEXT: v_lshrrev_b64 v[27:28], 24, v[18:19] +; SDAG-NEXT: v_lshrrev_b32_e32 v34, 8, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; SDAG-NEXT: v_lshrrev_b32_e32 v36, 8, v16 +; SDAG-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; SDAG-NEXT: v_lshrrev_b32_e32 v21, 8, v17 +; SDAG-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; SDAG-NEXT: v_lshrrev_b32_e32 v23, 24, v17 +; SDAG-NEXT: v_lshrrev_b32_e32 v25, 8, v18 +; SDAG-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; SDAG-NEXT: v_lshrrev_b32_e32 v29, 8, v19 +; SDAG-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; SDAG-NEXT: v_lshrrev_b32_e32 v31, 24, v19 +; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 +; SDAG-NEXT: v_mov_b32_e32 v12, v3 +; SDAG-NEXT: v_mov_b32_e32 v20, v17 +; SDAG-NEXT: v_mov_b32_e32 v24, v18 +; SDAG-NEXT: v_mov_b32_e32 v28, v19 +; SDAG-NEXT: v_mov_b32_e32 v1, v34 +; SDAG-NEXT: v_mov_b32_e32 v2, v35 +; SDAG-NEXT: v_mov_b32_e32 v17, v36 +; SDAG-NEXT: v_mov_b32_e32 v18, v32 +; SDAG-NEXT: v_mov_b32_e32 v3, v33 +; SDAG-NEXT: v_mov_b32_e32 v19, v37 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: load_v32i8: @@ -2747,23 +2751,23 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) { ; GISEL-LABEL: store_v32i8: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 ; GISEL-NEXT: v_mov_b32_e32 v31, 8 ; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GISEL-NEXT: v_mov_b32_e32 v32, 0xff ; GISEL-NEXT: v_and_or_b32 v0, v0, v32, v1 +; GISEL-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GISEL-NEXT: v_or3_b32 v0, v0, v1, v2 ; GISEL-NEXT: v_lshlrev_b32_sdwa v1, v31, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v7 -; GISEL-NEXT: buffer_load_ubyte v7, off, s[0:3], s32 +; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7 ; GISEL-NEXT: v_and_or_b32 v1, v4, v32, v1 -; GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v6 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GISEL-NEXT: v_or3_b32 v0, v0, v2, v3 -; GISEL-NEXT: v_or3_b32 v1, v1, v4, v5 +; GISEL-NEXT: v_or3_b32 v1, v1, v2, v3 ; GISEL-NEXT: v_lshlrev_b32_sdwa v2, v31, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v10 ; GISEL-NEXT: v_and_b32_e32 v4, 0xff, v11 @@ -2784,28 +2788,28 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: v_and_or_b32 v4, v16, v32, v4 ; GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GISEL-NEXT: v_lshlrev_b32_sdwa v8, v31, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GISEL-NEXT: v_or3_b32 v4, v4, v5, v6 -; GISEL-NEXT: v_and_b32_e32 v5, 0xff, v22 -; GISEL-NEXT: v_and_b32_e32 v6, 0xff, v23 -; GISEL-NEXT: v_and_or_b32 v8, v20, v32, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GISEL-NEXT: v_or3_b32 v5, v8, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_sdwa v5, v31, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-NEXT: v_and_b32_e32 v6, 0xff, v22 +; GISEL-NEXT: v_and_b32_e32 v7, 0xff, v23 +; GISEL-NEXT: v_and_or_b32 v5, v20, v32, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GISEL-NEXT: v_or3_b32 v5, v5, v6, v7 ; GISEL-NEXT: v_lshlrev_b32_sdwa v6, v31, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GISEL-NEXT: v_and_b32_e32 v8, 0xff, v26 -; GISEL-NEXT: v_and_b32_e32 v9, 0xff, v27 +; GISEL-NEXT: v_and_b32_e32 v7, 0xff, v26 +; GISEL-NEXT: v_and_b32_e32 v8, 0xff, v27 ; GISEL-NEXT: v_and_or_b32 v6, v24, v32, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GISEL-NEXT: v_or3_b32 v6, v6, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_sdwa v7, v31, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-NEXT: v_and_b32_e32 v8, 0xff, v30 +; GISEL-NEXT: v_and_or_b32 v7, v28, v32, v7 ; GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GISEL-NEXT: v_or3_b32 v6, v6, v8, v9 -; GISEL-NEXT: v_lshlrev_b32_sdwa v8, v31, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GISEL-NEXT: v_and_b32_e32 v9, 0xff, v30 -; GISEL-NEXT: v_and_or_b32 v8, v28, v32, v8 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GISEL-NEXT: v_or3_b32 v7, v8, v9, v7 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 24, v33 +; GISEL-NEXT: v_or3_b32 v7, v7, v8, v9 ; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 ; GISEL-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 931a62298812f..7341568efae9c 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -275,42 +275,42 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v0, s[4:7], 0 offen offset:208 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v0, s[4:7], 0 offen offset:224 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v0, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: s_add_i32 s1, s8, s16 +; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16 ; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100 -; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x2000 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[12:15], 0 offen ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v0, s[12:15], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v1, s[12:15], 0 offen offset:16 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v0, s[12:15], 0 offen offset:32 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v1, s[12:15], 0 offen offset:32 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v0, s[12:15], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v1, s[12:15], 0 offen offset:48 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v0, s[12:15], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v1, s[12:15], 0 offen offset:64 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v0, s[12:15], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v1, s[12:15], 0 offen offset:80 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v0, s[12:15], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v1, s[12:15], 0 offen offset:96 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v0, s[12:15], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v1, s[12:15], 0 offen offset:112 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v0, s[12:15], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v1, s[12:15], 0 offen offset:128 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v0, s[12:15], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v1, s[12:15], 0 offen offset:144 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v0, s[12:15], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v1, s[12:15], 0 offen offset:160 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v0, s[12:15], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v1, s[12:15], 0 offen offset:176 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v0, s[12:15], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v1, s[12:15], 0 offen offset:192 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v0, s[12:15], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v1, s[12:15], 0 offen offset:208 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v0, s[12:15], 0 offen offset:224 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v1, s[12:15], 0 offen offset:224 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v1, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB0_1 ; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; SDAG-GFX942-NEXT: s_endpgm @@ -807,42 +807,42 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v0, s[4:7], 0 offen offset:208 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v0, s[4:7], 0 offen offset:224 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v0, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: s_add_i32 s1, s8, s16 +; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16 ; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100 -; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s1 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x100 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v1, s[12:15], 0 offen ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v0, s[12:15], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v1, s[12:15], 0 offen offset:16 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v0, s[12:15], 0 offen offset:32 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v1, s[12:15], 0 offen offset:32 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v0, s[12:15], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v1, s[12:15], 0 offen offset:48 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v0, s[12:15], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v1, s[12:15], 0 offen offset:64 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v0, s[12:15], 0 offen offset:80 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v1, s[12:15], 0 offen offset:80 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v0, s[12:15], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v1, s[12:15], 0 offen offset:96 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v0, s[12:15], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v1, s[12:15], 0 offen offset:112 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v0, s[12:15], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v1, s[12:15], 0 offen offset:128 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v0, s[12:15], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v1, s[12:15], 0 offen offset:144 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v0, s[12:15], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v1, s[12:15], 0 offen offset:160 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v0, s[12:15], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v1, s[12:15], 0 offen offset:176 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v0, s[12:15], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v1, s[12:15], 0 offen offset:192 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v0, s[12:15], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v1, s[12:15], 0 offen offset:208 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v0, s[12:15], 0 offen offset:224 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v1, s[12:15], 0 offen offset:224 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v0, s[12:15], 0 offen offset:240 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 a[0:3], v1, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB1_1 ; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; SDAG-GFX942-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index c407f7645315d..f34443a3e81d9 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -5165,98 +5165,98 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-LABEL: test_call_external_void_func_v32i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[8:9] -; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; VI-NEXT: s_swappc_b64 s[30:31], s[8:9] +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; ; CI-LABEL: test_call_external_void_func_v32i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[8:9] -; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; CI-NEXT: s_swappc_b64 s[30:31], s[8:9] +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; ; SDAG-LABEL: test_call_external_void_func_v32i32: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SDAG-NEXT: s_mov_b32 s6, -1 ; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; SDAG-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; SDAG-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 ; SDAG-NEXT: s_mov_b32 s38, -1 ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s3 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; SDAG-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; SDAG-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] ; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b32 s32, 0 -; SDAG-NEXT: s_getpc_b64 s[8:9] -; SDAG-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 -; SDAG-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; SDAG-NEXT: s_getpc_b64 s[4:5] +; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 +; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] -; SDAG-NEXT: s_waitcnt vmcnt(6) +; SDAG-NEXT: s_waitcnt vmcnt(7) ; SDAG-NEXT: buffer_store_dword v31, off, s[36:39], s32 -; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9] +; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v32i32: @@ -5304,13 +5304,13 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 ; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_getpc_b64 s[12:13] -; HSA-NEXT: s_add_u32 s12, s12, external_void_func_v32i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s13, s13, external_void_func_v32i32@rel32@hi+12 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_waitcnt vmcnt(7) ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 -; HSA-NEXT: s_swappc_b64 s[30:31], s[12:13] +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm ; ; GISEL-LABEL: test_call_external_void_func_v32i32: @@ -5354,7 +5354,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 -; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] ; GISEL-NEXT: v_mov_b32_e32 v20, s12 ; GISEL-NEXT: v_mov_b32_e32 v21, s13 ; GISEL-NEXT: v_mov_b32_e32 v22, s14 @@ -5366,6 +5365,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GISEL-NEXT: v_mov_b32_e32 v28, s20 ; GISEL-NEXT: v_mov_b32_e32 v29, s21 ; GISEL-NEXT: v_mov_b32_e32 v30, s22 +; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GISEL-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) poison @@ -5377,26 +5377,26 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; VI-LABEL: test_call_external_void_func_v32i32_i32: ; VI: ; %bb.0: +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], 0 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s5 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_addc_u32 s37, s37, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[4:7], 0 -; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] @@ -5404,34 +5404,34 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; ; CI-LABEL: test_call_external_void_func_v32i32_i32: ; CI: ; %bb.0: +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s5 -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_addc_u32 s37, s37, 0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dword v32, off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] @@ -5439,34 +5439,34 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(8) -; CI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 -; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; ; SDAG-LABEL: test_call_external_void_func_v32i32_i32: ; SDAG: ; %bb.0: +; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SDAG-NEXT: s_mov_b32 s2, -1 ; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; SDAG-NEXT: buffer_load_dword v32, off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; SDAG-NEXT: buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; SDAG-NEXT: buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96 ; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; SDAG-NEXT: s_mov_b32 s38, -1 ; SDAG-NEXT: s_mov_b32 s39, 0xe00000 ; SDAG-NEXT: s_add_u32 s36, s36, s5 -; SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SDAG-NEXT: s_mov_b32 s6, -1 ; SDAG-NEXT: s_addc_u32 s37, s37, 0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: buffer_load_dword v32, off, s[4:7], 0 -; SDAG-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 -; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; SDAG-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; SDAG-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 -; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1] ; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] ; SDAG-NEXT: s_mov_b32 s32, 0 ; SDAG-NEXT: s_getpc_b64 s[4:5] @@ -5474,9 +5474,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 ; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] ; SDAG-NEXT: s_waitcnt vmcnt(8) -; SDAG-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 -; SDAG-NEXT: s_waitcnt vmcnt(8) ; SDAG-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; SDAG-NEXT: s_waitcnt vmcnt(8) +; SDAG-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SDAG-NEXT: s_endpgm ; @@ -5519,8 +5519,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; HSA-NEXT: s_mov_b32 s11, 0x1100f000 ; HSA-NEXT: s_mov_b32 s10, -1 ; HSA-NEXT: s_waitcnt lgkmcnt(0) -; HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 ; HSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 @@ -5535,9 +5535,9 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32_i32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_waitcnt vmcnt(8) -; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; HSA-NEXT: s_waitcnt vmcnt(8) ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; HSA-NEXT: s_waitcnt vmcnt(8) +; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm ; @@ -5587,7 +5587,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 -; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] ; GISEL-NEXT: v_mov_b32_e32 v20, s12 ; GISEL-NEXT: v_mov_b32_e32 v21, s13 ; GISEL-NEXT: v_mov_b32_e32 v22, s14 @@ -5599,6 +5598,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GISEL-NEXT: v_mov_b32_e32 v28, s20 ; GISEL-NEXT: v_mov_b32_e32 v29, s21 ; GISEL-NEXT: v_mov_b32_e32 v30, s22 +; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GISEL-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) poison @@ -6908,7 +6908,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 -; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] ; GISEL-NEXT: v_mov_b32_e32 v20, s12 ; GISEL-NEXT: v_mov_b32_e32 v21, s13 ; GISEL-NEXT: v_mov_b32_e32 v22, s14 @@ -6920,6 +6919,7 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GISEL-NEXT: v_mov_b32_e32 v28, s20 ; GISEL-NEXT: v_mov_b32_e32 v29, s21 ; GISEL-NEXT: v_mov_b32_e32 v30, s22 +; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55] ; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GISEL-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir index 156979d6d06a5..93470d443409b 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir @@ -25,7 +25,7 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF @@ -33,9 +33,10 @@ body: | ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF]], implicit $mode, implicit $exec ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF1]], implicit $mode, implicit $exec ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -51,34 +52,33 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]] - ; CHECK-NEXT: [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec - ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF6]], [[DEF6]], implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MUL_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF12]], implicit $mode, implicit $exec ; CHECK-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[DEF13:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MUL_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MUL_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_3]], [[DEF13]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF14:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; CHECK-NEXT: $sgpr4 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = COPY [[DEF10]] + ; CHECK-NEXT: $vgpr0 = COPY [[DEF11]] ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] - ; CHECK-NEXT: $vgpr1 = COPY [[DEF6]] + ; CHECK-NEXT: $vgpr1 = COPY [[DEF7]] ; CHECK-NEXT: $vgpr0 = COPY [[V_MUL_F32_e32_1]] ; CHECK-NEXT: $vgpr1 = COPY [[V_MUL_F32_e32_2]] - ; CHECK-NEXT: $vgpr2 = COPY [[V_MUL_F32_e32_3]] - ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF13]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0 - ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF7]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF11]], [[DEF8]], [[V_ADD_F32_e32_1]], implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF3]], 0, [[DEF]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF14]], [[DEF9]], 0, 0, implicit $exec + ; CHECK-NEXT: $vgpr2 = COPY [[V_MUL_F32_e32_5]] + ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF14]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0 + ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF8]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], [[V_ADD_F32_e32_1]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index dd9a013d37203..a42d1ff6e6785 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -134,61 +134,61 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v6, v[10:11] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v6, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v15, v7, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v9, v6, v12, vcc ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v22, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v9, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, -1, v21 ; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v20, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, -1, v0, vcc ; GFX9-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GFX9-NEXT: v_or_b32_e32 v4, v14, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v9 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] -; GFX9-NEXT: v_or_b32_e32 v5, v15, v31 ; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 31, v3 -; GFX9-NEXT: v_or_b32_e32 v8, v8, v15 -; GFX9-NEXT: v_or_b32_e32 v10, v10, v14 -; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v26, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v27, v9, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v28, v10, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v15 +; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[14:15] +; GFX9-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 31, v3 +; GFX9-NEXT: v_or_b32_e32 v10, v14, v10 +; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v26, v10 +; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v27, v15, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v28, v6, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v29, v11, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v14 -; GFX9-NEXT: v_and_b32_e32 v14, v30, v21 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v14 -; GFX9-NEXT: v_and_b32_e32 v14, v30, v20 -; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v14, vcc -; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12 -; GFX9-NEXT: v_and_b32_e32 v6, v30, v0 -; GFX9-NEXT: v_and_b32_e32 v14, v30, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v6, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v14, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v32, 31, v14 +; GFX9-NEXT: v_and_b32_e32 v14, v32, v21 +; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v10, v14 +; GFX9-NEXT: v_and_b32_e32 v10, v32, v20 +; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v15, v10, vcc +; GFX9-NEXT: v_and_b32_e32 v10, v32, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v10, vcc +; GFX9-NEXT: v_and_b32_e32 v6, v32, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc -; GFX9-NEXT: v_or_b32_e32 v14, v22, v24 -; GFX9-NEXT: v_or_b32_e32 v15, v23, v25 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: v_and_b32_e32 v6, 1, v30 -; GFX9-NEXT: v_mov_b32_e32 v15, v7 +; GFX9-NEXT: v_or_b32_e32 v30, v22, v24 +; GFX9-NEXT: v_or_b32_e32 v31, v23, v25 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31] +; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12 +; GFX9-NEXT: v_or_b32_e32 v5, v9, v5 +; GFX9-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v32 +; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB0_3 ; GFX9-NEXT: ; %bb.4: ; %Flow @@ -1145,13 +1145,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_ashrrev_i32_e32 v16, 31, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v0, v16, v0 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v16, v1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v16 +; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v0, v16 ; GFX9-G-NEXT: v_xor_b32_e32 v2, v16, v2 -; GFX9-G-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v16, vcc ; GFX9-G-NEXT: v_ashrrev_i32_e32 v17, 31, v7 ; GFX9-G-NEXT: v_xor_b32_e32 v3, v16, v3 -; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v2, v16, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v16, vcc ; GFX9-G-NEXT: v_xor_b32_e32 v0, v17, v4 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v17, v5 ; GFX9-G-NEXT: v_sub_co_u32_e32 v18, vcc, v0, v17 @@ -1163,8 +1163,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4 ; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5 ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v0, v8, v10 -; GFX9-G-NEXT: v_or_b32_e32 v1, v9, v11 +; GFX9-G-NEXT: v_or_b32_e32 v0, v10, v12 +; GFX9-G-NEXT: v_or_b32_e32 v1, v11, v13 ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18 ; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19 @@ -1176,15 +1176,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[4:5] ; GFX9-G-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v8 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v10 ; GFX9-G-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7] -; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v9 +; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v11 ; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v12 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v11 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v13 ; GFX9-G-NEXT: v_add_u32_e32 v3, 32, v3 -; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[12:13] ; GFX9-G-NEXT: v_add_u32_e32 v1, 64, v1 ; GFX9-G-NEXT: v_min_u32_e32 v2, v2, v3 ; GFX9-G-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[6:7] @@ -1209,10 +1209,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2 ; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20 ; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v12, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v13, 0, vcc ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14 @@ -1227,23 +1227,23 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v2, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc ; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, 0x7f, v0 -; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v12 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v12, v[10:11] -; GFX9-G-NEXT: v_add_u32_e32 v13, 0xffffffc0, v12 -; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v12, v[8:9] +; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, 0x7f, v0 +; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13] +; GFX9-G-NEXT: v_add_u32_e32 v9, 0xffffffc0, v8 +; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v13, v[8:9] -; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v9, v[10:11] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 ; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 -; GFX9-G-NEXT: v_cndmask_b32_e32 v12, v0, v10, vcc -; GFX9-G-NEXT: v_cndmask_b32_e32 v13, v1, v11, vcc +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v1, v13, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 @@ -1254,13 +1254,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[8:9] -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13] ; GFX9-G-NEXT: v_add_u32_e32 v24, 0xffffffc0, v20 -; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[10:11] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13] ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[12:13] ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v20 ; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -1270,54 +1270,54 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20 ; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, v8, s[4:5] -; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v10, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v11, s[4:5] ; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc -; GFX9-G-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-G-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v8, 31, v7 -; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 -; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[10:11] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v13 -; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] -; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v11 -; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v24, v2 -; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v14 -; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v25, v3, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v26, v0, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v27, v1, vcc -; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v10 -; GFX9-G-NEXT: v_and_b32_e32 v10, v28, v18 -; GFX9-G-NEXT: v_and_b32_e32 v11, v28, v19 -; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v2, v10 -; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v11, vcc -; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v4 -; GFX9-G-NEXT: v_and_b32_e32 v3, v28, v5 -; GFX9-G-NEXT: v_subb_co_u32_e32 v14, vcc, v0, v2, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v3, vcc ; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, -1, v20 ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc +; GFX9-G-NEXT: v_or_b32_e32 v2, v20, v22 +; GFX9-G-NEXT: v_or_b32_e32 v3, v21, v23 +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[14:15] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v13 ; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] -; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22 -; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23 -; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v8 -; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v28 -; GFX9-G-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v9 +; GFX9-G-NEXT: v_or_b32_e32 v10, v12, v10 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v7 +; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v6 +; GFX9-G-NEXT: v_sub_co_u32_e64 v0, s[4:5], v24, v10 +; GFX9-G-NEXT: v_subb_co_u32_e64 v0, s[4:5], v25, v13, s[4:5] +; GFX9-G-NEXT: v_subb_co_u32_e64 v0, s[4:5], v26, v2, s[4:5] +; GFX9-G-NEXT: v_subb_co_u32_e64 v0, s[4:5], v27, v3, s[4:5] +; GFX9-G-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v7 +; GFX9-G-NEXT: v_and_b32_e32 v1, v0, v18 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX9-G-NEXT: v_and_b32_e32 v14, v0, v19 +; GFX9-G-NEXT: v_sub_co_u32_e64 v12, s[4:5], v10, v1 +; GFX9-G-NEXT: v_subb_co_u32_e64 v13, s[4:5], v13, v14, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v1, v0, v4 +; GFX9-G-NEXT: v_and_b32_e32 v10, v0, v5 +; GFX9-G-NEXT: v_subb_co_u32_e64 v14, s[4:5], v2, v1, s[4:5] +; GFX9-G-NEXT: v_subb_co_u32_e64 v15, s[4:5], v3, v10, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v10, 1, v0 +; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX9-G-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3 ; GFX9-G-NEXT: ; %bb.4: ; %Flow @@ -1325,9 +1325,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: .LBB0_5: ; %Flow2 ; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-G-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v4 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v4 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 ; GFX9-G-NEXT: .LBB0_6: ; %Flow3 @@ -1336,9 +1336,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_xor_b32_e32 v0, v6, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v7, v3 ; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 -; GFX9-G-NEXT: v_xor_b32_e32 v2, v12, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v8, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-G-NEXT: v_xor_b32_e32 v4, v13, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v4, v9, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc ; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-G-NEXT: s_setpc_b64 s[30:31] @@ -2329,63 +2329,63 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subrev_u32_e32 v12, 64, v18 ; GFX9-NEXT: v_or_b32_e32 v15, v13, v15 ; GFX9-NEXT: v_lshrrev_b64 v[12:13], v12, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v18, v[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v17, v13, v1, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[1:2], v18, v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v12, v0, s[4:5] ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v6, vcc ; GFX9-NEXT: v_mov_b32_e32 v14, 0 -; GFX9-NEXT: v_mov_b32_e32 v16, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v7, vcc ; GFX9-NEXT: v_mov_b32_e32 v15, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 -; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v10, v16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v2, v2, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v9 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v16 -; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v22, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v23, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v17 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] +; GFX9-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v9 +; GFX9-NEXT: v_or_b32_e32 v12, v16, v12 +; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v22, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v23, v17, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v24, v2, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v25, v3, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v26, 31, v16 -; GFX9-NEXT: v_and_b32_e32 v16, v26, v4 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v16 -; GFX9-NEXT: v_and_b32_e32 v16, v26, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v16, vcc -; GFX9-NEXT: v_and_b32_e32 v16, v26, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v16, vcc -; GFX9-NEXT: v_and_b32_e32 v16, v26, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v16, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v28, 31, v16 +; GFX9-NEXT: v_and_b32_e32 v16, v28, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v12, v16 +; GFX9-NEXT: v_and_b32_e32 v12, v28, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v12, vcc +; GFX9-NEXT: v_and_b32_e32 v12, v28, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v12, vcc +; GFX9-NEXT: v_and_b32_e32 v12, v28, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, -1, v18 ; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v20, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc -; GFX9-NEXT: v_or_b32_e32 v11, v17, v11 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] -; GFX9-NEXT: v_or_b32_e32 v16, v18, v20 -; GFX9-NEXT: v_or_b32_e32 v17, v19, v21 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GFX9-NEXT: v_or_b32_e32 v26, v18, v20 +; GFX9-NEXT: v_or_b32_e32 v27, v19, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[26:27] ; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 -; GFX9-NEXT: v_and_b32_e32 v12, 1, v26 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_and_b32_e32 v12, 1, v28 +; GFX9-NEXT: v_or_b32_e32 v11, v1, v11 +; GFX9-NEXT: v_or_b32_e32 v10, v0, v10 +; GFX9-NEXT: v_mov_b32_e32 v0, v12 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v16, v12 +; GFX9-NEXT: v_mov_b32_e32 v1, v13 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB1_3 ; GFX9-NEXT: ; %bb.4: ; %Flow @@ -3351,40 +3351,40 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 ; GFX9-G-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[14:15] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v0, 31, v15 -; GFX9-G-NEXT: v_or_b32_e32 v14, v10, v12 -; GFX9-G-NEXT: v_or_b32_e32 v15, v11, v13 -; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[16:17] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12 -; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 -; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 -; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v22, v2 -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v23, v3, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v24, v10, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v11, vcc -; GFX9-G-NEXT: v_ashrrev_i32_e32 v12, 31, v12 -; GFX9-G-NEXT: v_and_b32_e32 v13, v12, v4 -; GFX9-G-NEXT: v_and_b32_e32 v16, v12, v5 -; GFX9-G-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v13 -; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v16, vcc -; GFX9-G-NEXT: v_and_b32_e32 v13, v12, v6 -; GFX9-G-NEXT: v_and_b32_e32 v17, v12, v7 -; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v10, v13, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v11, v17, vcc ; GFX9-G-NEXT: v_add_co_u32_e32 v18, vcc, -1, v18 ; GFX9-G-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v20, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc -; GFX9-G-NEXT: v_or_b32_e32 v10, v18, v20 -; GFX9-G-NEXT: v_or_b32_e32 v11, v19, v21 +; GFX9-G-NEXT: v_or_b32_e32 v12, v18, v20 +; GFX9-G-NEXT: v_or_b32_e32 v13, v19, v21 +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[16:17] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v0, 31, v3 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX9-G-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 +; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v15 +; GFX9-G-NEXT: v_lshlrev_b64 v[14:15], 1, v[14:15] ; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] -; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0 -; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v12 +; GFX9-G-NEXT: v_or_b32_e32 v14, v10, v14 +; GFX9-G-NEXT: v_sub_co_u32_e64 v10, s[4:5], v22, v2 +; GFX9-G-NEXT: v_subb_co_u32_e64 v10, s[4:5], v23, v3, s[4:5] +; GFX9-G-NEXT: v_subb_co_u32_e64 v10, s[4:5], v24, v0, s[4:5] +; GFX9-G-NEXT: v_subb_co_u32_e64 v10, s[4:5], v25, v13, s[4:5] +; GFX9-G-NEXT: v_ashrrev_i32_e32 v10, 31, v10 +; GFX9-G-NEXT: v_or_b32_e32 v15, v11, v15 +; GFX9-G-NEXT: v_and_b32_e32 v11, v10, v4 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX9-G-NEXT: v_and_b32_e32 v12, v10, v5 +; GFX9-G-NEXT: v_sub_co_u32_e64 v2, s[4:5], v2, v11 +; GFX9-G-NEXT: v_subb_co_u32_e64 v3, s[4:5], v3, v12, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v11, v10, v6 +; GFX9-G-NEXT: v_and_b32_e32 v12, v10, v7 +; GFX9-G-NEXT: v_subb_co_u32_e64 v16, s[4:5], v0, v11, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v10 ; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-G-NEXT: v_subb_co_u32_e64 v17, s[4:5], v13, v12, s[4:5] ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v10, v0 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index ddac86b3719c2..0210fca28dade 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -7,419 +7,419 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3 ; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v26, v24 ; SDAG-NEXT: v_mov_b32_e32 v27, v25 -; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v0, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v19, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v1, v20 -; SDAG-NEXT: v_ffbh_u32_e32 v2, v21 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v0, v20, v16 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v2, v18, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v1, v16 +; SDAG-NEXT: v_ffbh_u32_e32 v2, v17 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v3, v0, s[4:5] ; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v16 -; SDAG-NEXT: v_or_b32_e32 v1, v21, v17 +; SDAG-NEXT: v_or_b32_e32 v0, v16, v18 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v18 ; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc -; SDAG-NEXT: v_min_u32_e32 v2, v19, v2 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v17 -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7] -; SDAG-NEXT: v_min_u32_e32 v1, v19, v22 +; SDAG-NEXT: v_or_b32_e32 v1, v17, v19 +; SDAG-NEXT: v_min_u32_e32 v2, v21, v2 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v22 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v19 +; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v10, vcc +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_min_u32_e32 v1, v21, v22 ; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2 -; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[6:7] -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc +; SDAG-NEXT: v_addc_u32_e64 v8, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v3, s[4:5] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc ; SDAG-NEXT: v_ffbh_u32_e32 v3, v29 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v28 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7] +; SDAG-NEXT: v_ffbh_u32_e32 v21, v28 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v9, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v2, v29, v0 -; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3 +; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v3 ; SDAG-NEXT: v_ffbh_u32_e32 v11, v0 ; SDAG-NEXT: v_or_b32_e32 v3, v28, v1 -; SDAG-NEXT: v_min_u32_e32 v8, v8, v19 +; SDAG-NEXT: v_min_u32_e32 v9, v9, v21 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v1 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_min_u32_e32 v2, v11, v19 -; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8 -; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7] -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v11, v8, 0, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[6:7] -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v2, v10 -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v9, vcc -; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v8 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc -; SDAG-NEXT: v_or_b32_e32 v2, v2, v10 -; SDAG-NEXT: v_or_b32_e32 v3, v9, v11 +; SDAG-NEXT: v_min_u32_e32 v2, v11, v21 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 64, v9 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5] +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v9, v8, vcc +; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v2 +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v20, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v20, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 +; SDAG-NEXT: v_or_b32_e32 v11, v3, v9 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v19, s[4:5] -; SDAG-NEXT: v_and_b32_e32 v2, 1, v2 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v21, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v10, 1, v10 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v17, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v19, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v22, v16, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v18, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, v17, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc -; SDAG-NEXT: v_cndmask_b32_e64 v23, v20, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, v16, 0, s[4:5] ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v8 -; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v8 -; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v9, vcc -; SDAG-NEXT: v_lshl_b64 v[18:19], v[20:21], v18 -; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v11, vcc, 0x7f, v8 -; SDAG-NEXT: v_or_b32_e32 v10, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v11 -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 64, v11 -; SDAG-NEXT: v_lshl_b64 v[34:35], v[20:21], v11 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] -; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v8 -; SDAG-NEXT: v_or_b32_e32 v9, v23, v9 -; SDAG-NEXT: v_or_b32_e32 v8, v22, v8 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v11 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v19, v9, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v35, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v34, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5] +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v2 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v2 ; SDAG-NEXT: v_mov_b32_e32 v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc +; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20 +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v8, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v2 +; SDAG-NEXT: v_or_b32_e32 v9, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[18:19], v34 +; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v34 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v35 +; SDAG-NEXT: v_or_b32_e32 v3, v3, v9 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v8 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v8, v21, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v22, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v8, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v18, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[2:3], v[20:21], v30 -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v30 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[16:17], v10 -; SDAG-NEXT: v_or_b32_e32 v11, v3, v11 -; SDAG-NEXT: v_or_b32_e32 v10, v2, v10 -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 -; SDAG-NEXT: v_subrev_i32_e64 v2, s[4:5], 64, v30 -; SDAG-NEXT: v_lshr_b64 v[2:3], v[16:17], v2 -; SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v21, v3, v21, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v20, v2, v20, s[4:5] -; SDAG-NEXT: v_lshr_b64 v[2:3], v[16:17], v30 -; SDAG-NEXT: v_cndmask_b32_e32 v23, 0, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v22, 0, v2, vcc +; SDAG-NEXT: v_lshr_b64 v[10:11], v[16:17], v30 +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, 64, v30 +; SDAG-NEXT: v_subrev_i32_e32 v21, vcc, 64, v30 +; SDAG-NEXT: v_lshr_b64 v[22:23], v[18:19], v30 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29 +; SDAG-NEXT: v_lshl_b64 v[35:36], v[18:19], v20 +; SDAG-NEXT: v_lshr_b64 v[18:19], v[18:19], v21 +; SDAG-NEXT: v_or_b32_e32 v36, v11, v36 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v35 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v36, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v22, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v17, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc -; SDAG-NEXT: v_mov_b32_e32 v10, 0 -; SDAG-NEXT: v_mov_b32_e32 v11, 0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0 -; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v10, v16, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: v_mov_b32_e32 v17, 0 ; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshrrev_b32_e32 v2, 31, v19 -; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v21 -; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v9 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; SDAG-NEXT: v_or_b32_e32 v19, v17, v19 -; SDAG-NEXT: v_or_b32_e32 v18, v16, v18 -; SDAG-NEXT: v_or_b32_e32 v16, v22, v38 -; SDAG-NEXT: v_or_b32_e32 v17, v20, v39 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 -; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v34, v17 -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v35, v21, vcc -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v36, v16, vcc -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v37, v23, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; SDAG-NEXT: v_and_b32_e32 v20, v2, v29 -; SDAG-NEXT: v_and_b32_e32 v22, v2, v28 -; SDAG-NEXT: v_and_b32_e32 v38, v2, v0 -; SDAG-NEXT: v_and_b32_e32 v39, v2, v1 -; SDAG-NEXT: v_and_b32_e32 v2, 1, v2 -; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v17, v20 -; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v21, v22, vcc -; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v16, v38, vcc -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v39, vcc +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v19 +; SDAG-NEXT: v_or_b32_e32 v10, v22, v10 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v9 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v22 +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v34, v18 +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v35, v19, vcc +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v36, v10, vcc +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v37, v23, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v48, 31, v22 +; SDAG-NEXT: v_and_b32_e32 v22, v48, v29 +; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v18, v22 +; SDAG-NEXT: v_and_b32_e32 v22, v48, v28 +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v19, v22, vcc +; SDAG-NEXT: v_and_b32_e32 v22, v48, v0 +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v10, v22, vcc +; SDAG-NEXT: v_and_b32_e32 v10, v48, v1 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v10, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v30, v32 -; SDAG-NEXT: v_or_b32_e32 v17, v31, v33 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_or_b32_e32 v9, v11, v9 -; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v8, v10, v8 -; SDAG-NEXT: v_mov_b32_e32 v17, v3 -; SDAG-NEXT: v_mov_b32_e32 v16, v2 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v3 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v10 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_or_b32_e32 v3, v17, v3 +; SDAG-NEXT: v_or_b32_e32 v2, v16, v2 +; SDAG-NEXT: v_or_b32_e32 v9, v21, v9 +; SDAG-NEXT: v_and_b32_e32 v10, 1, v48 +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v8, v20, v8 +; SDAG-NEXT: v_mov_b32_e32 v17, v11 +; SDAG-NEXT: v_mov_b32_e32 v16, v10 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB0_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v19 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[18:19], 1 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v16 -; SDAG-NEXT: v_or_b32_e32 v18, v11, v1 -; SDAG-NEXT: v_or_b32_e32 v19, v3, v9 -; SDAG-NEXT: v_or_b32_e32 v22, v10, v0 -; SDAG-NEXT: v_or_b32_e32 v23, v2, v8 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 +; SDAG-NEXT: v_or_b32_e32 v21, v21, v1 +; SDAG-NEXT: v_or_b32_e32 v22, v11, v3 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v0 +; SDAG-NEXT: v_or_b32_e32 v23, v10, v2 ; SDAG-NEXT: .LBB0_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v7 ; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f -; SDAG-NEXT: v_mov_b32_e32 v20, v16 -; SDAG-NEXT: v_mov_b32_e32 v21, v17 +; SDAG-NEXT: v_mov_b32_e32 v18, v16 +; SDAG-NEXT: v_mov_b32_e32 v19, v17 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc ; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v7, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v8, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v1, v2 -; SDAG-NEXT: v_ffbh_u32_e32 v4, v3 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v0, s[4:5] -; SDAG-NEXT: v_sub_i32_e32 v5, vcc, 0, v12 -; SDAG-NEXT: v_or_b32_e32 v0, v2, v8 -; SDAG-NEXT: v_ffbh_u32_e32 v6, v8 -; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 32, v1 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v1, v3, v9 -; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], 32, v6 -; SDAG-NEXT: v_ffbh_u32_e32 v30, v9 -; SDAG-NEXT: v_min_u32_e32 v4, v7, v4 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, 0, v14, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v6, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v7, vcc, 0, v12 +; SDAG-NEXT: v_or_b32_e32 v0, v2, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v8, v4 +; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v1 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v13, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v3, v5 +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], 32, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v30, v5 +; SDAG-NEXT: v_min_u32_e32 v6, v10, v6 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v14, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] -; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v10, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v7, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_min_u32_e32 v1, v6, v30 -; SDAG-NEXT: v_add_i32_e64 v4, s[8:9], 64, v4 -; SDAG-NEXT: v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, 0, v15, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v7, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v7, v29 -; SDAG-NEXT: v_ffbh_u32_e32 v10, v28 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v5, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v13, v4, v1, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v6, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v4, v29, v0 -; SDAG-NEXT: v_ffbh_u32_e32 v6, v0 -; SDAG-NEXT: v_add_i32_e32 v7, vcc, 32, v7 -; SDAG-NEXT: v_or_b32_e32 v5, v28, v1 -; SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v6 +; SDAG-NEXT: v_min_u32_e32 v1, v8, v30 +; SDAG-NEXT: v_add_i32_e64 v6, s[8:9], 64, v6 +; SDAG-NEXT: v_addc_u32_e64 v7, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v15, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v10, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v10, v29 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v28 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v7, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v13, v6, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v8, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v6, v29, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v8, v0 +; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10 +; SDAG-NEXT: v_or_b32_e32 v7, v28, v1 +; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v8 ; SDAG-NEXT: v_ffbh_u32_e32 v14, v1 -; SDAG-NEXT: v_min_u32_e32 v7, v7, v10 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: v_min_u32_e32 v4, v6, v14 -; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v7 -; SDAG-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v10, v10, v11 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_min_u32_e32 v6, v8, v14 +; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 64, v10 +; SDAG-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v7, v6, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v4, v13 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v12, vcc -; SDAG-NEXT: v_xor_b32_e32 v4, 0x7f, v6 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v12, vcc +; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v6 +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v9, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v11, vcc -; SDAG-NEXT: v_or_b32_e32 v4, v4, v10 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v5, v7, v11 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_or_b32_e32 v11, v7, v9 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: v_and_b32_e32 v4, 1, v12 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_and_b32_e32 v10, 1, v12 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v13, v9, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v5, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v8, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v4, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6 ; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 -; SDAG-NEXT: v_mov_b32_e32 v5, 0 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v7, vcc ; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12 -; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc -; SDAG-NEXT: v_or_b32_e32 v10, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v6 -; SDAG-NEXT: v_or_b32_e32 v11, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[6:7], v[8:9], v34 -; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 -; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35 -; SDAG-NEXT: v_or_b32_e32 v7, v7, v11 -; SDAG-NEXT: v_or_b32_e32 v6, v6, v10 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v8, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v7, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v9, vcc, 0x7f, v6 +; SDAG-NEXT: v_or_b32_e32 v8, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[14:15], v[4:5], v9 +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, 64, v9 +; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v9 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] +; SDAG-NEXT: v_lshr_b64 v[6:7], v[2:3], v6 +; SDAG-NEXT: v_or_b32_e32 v7, v15, v7 +; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v8, v13, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v35, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v34, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v8, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader -; SDAG-NEXT: v_lshr_b64 v[14:15], v[2:3], v30 -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, 64, v30 -; SDAG-NEXT: v_subrev_i32_e32 v35, vcc, 64, v30 -; SDAG-NEXT: v_lshr_b64 v[37:38], v[8:9], v30 +; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v30 +; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 64, v30 +; SDAG-NEXT: v_subrev_i32_e32 v13, vcc, 64, v30 +; SDAG-NEXT: v_lshr_b64 v[14:15], v[4:5], v30 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29 +; SDAG-NEXT: v_lshl_b64 v[35:36], v[4:5], v12 +; SDAG-NEXT: v_lshr_b64 v[4:5], v[4:5], v13 +; SDAG-NEXT: v_or_b32_e32 v36, v11, v36 ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v5, 0 -; SDAG-NEXT: v_lshl_b64 v[48:49], v[8:9], v4 -; SDAG-NEXT: v_lshr_b64 v[8:9], v[8:9], v35 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v35 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v36, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v4, v15, v49 -; SDAG-NEXT: v_or_b32_e32 v14, v14, v48 +; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v15, 0, v38, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v14, 0, v37, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v15, 0, v15, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc -; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SDAG-NEXT: v_cndmask_b32_e32 v9, v4, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: .LBB0_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v9 -; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v7 -; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v11 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v4, v14, v4 -; SDAG-NEXT: v_or_b32_e32 v8, v8, v38 -; SDAG-NEXT: v_or_b32_e32 v6, v6, v39 -; SDAG-NEXT: v_or_b32_e32 v7, v13, v7 -; SDAG-NEXT: v_or_b32_e32 v11, v3, v11 -; SDAG-NEXT: v_sub_i32_e32 v3, vcc, v34, v8 -; SDAG-NEXT: v_or_b32_e32 v6, v12, v6 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v35, v9, vcc -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v36, v4, vcc -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v37, v15, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v3 -; SDAG-NEXT: v_and_b32_e32 v3, v38, v29 -; SDAG-NEXT: v_and_b32_e32 v14, v38, v28 -; SDAG-NEXT: v_and_b32_e32 v39, v38, v0 -; SDAG-NEXT: v_and_b32_e32 v48, v38, v1 -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v3 -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v14, vcc -; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v4, v39, vcc -; SDAG-NEXT: v_subb_u32_e32 v15, vcc, v15, v48, vcc +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v5 +; SDAG-NEXT: v_or_b32_e32 v10, v14, v10 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v14, 31, v9 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v14 +; SDAG-NEXT: v_sub_i32_e32 v14, vcc, v34, v4 +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v35, v5, vcc +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v36, v10, vcc +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v37, v15, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v48, 31, v14 +; SDAG-NEXT: v_and_b32_e32 v14, v48, v29 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 +; SDAG-NEXT: v_and_b32_e32 v14, v48, v28 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v14, vcc +; SDAG-NEXT: v_and_b32_e32 v14, v48, v0 +; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v10, v14, vcc +; SDAG-NEXT: v_and_b32_e32 v10, v48, v1 +; SDAG-NEXT: v_subb_u32_e32 v15, vcc, v15, v10, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc -; SDAG-NEXT: v_or_b32_e32 v3, v30, v32 -; SDAG-NEXT: v_or_b32_e32 v4, v31, v33 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[3:4] -; SDAG-NEXT: v_and_b32_e32 v4, 1, v38 +; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v7 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v10 +; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; SDAG-NEXT: v_or_b32_e32 v7, v3, v7 +; SDAG-NEXT: v_or_b32_e32 v6, v2, v6 +; SDAG-NEXT: v_or_b32_e32 v9, v13, v9 +; SDAG-NEXT: v_and_b32_e32 v10, 1, v48 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v10, v2, v10 -; SDAG-NEXT: v_mov_b32_e32 v2, v4 -; SDAG-NEXT: v_mov_b32_e32 v3, v5 +; SDAG-NEXT: v_or_b32_e32 v8, v12, v8 +; SDAG-NEXT: v_mov_b32_e32 v2, v10 +; SDAG-NEXT: v_mov_b32_e32 v3, v11 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB0_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v11 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v6 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; SDAG-NEXT: v_or_b32_e32 v13, v13, v1 -; SDAG-NEXT: v_or_b32_e32 v14, v5, v3 -; SDAG-NEXT: v_or_b32_e32 v5, v12, v0 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v2 +; SDAG-NEXT: v_or_b32_e32 v14, v11, v3 +; SDAG-NEXT: v_or_b32_e32 v11, v12, v0 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v2 ; SDAG-NEXT: .LBB0_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26 ; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24 -; SDAG-NEXT: v_xor_b32_e32 v7, v21, v20 +; SDAG-NEXT: v_xor_b32_e32 v7, v19, v18 ; SDAG-NEXT: v_xor_b32_e32 v6, v17, v16 -; SDAG-NEXT: v_xor_b32_e32 v8, v18, v3 -; SDAG-NEXT: v_xor_b32_e32 v9, v22, v2 -; SDAG-NEXT: v_xor_b32_e32 v1, v19, v3 +; SDAG-NEXT: v_xor_b32_e32 v4, v21, v3 +; SDAG-NEXT: v_xor_b32_e32 v5, v20, v2 +; SDAG-NEXT: v_xor_b32_e32 v1, v22, v3 ; SDAG-NEXT: v_xor_b32_e32 v0, v23, v2 -; SDAG-NEXT: v_xor_b32_e32 v10, v13, v7 -; SDAG-NEXT: v_xor_b32_e32 v11, v5, v6 -; SDAG-NEXT: v_xor_b32_e32 v5, v14, v7 +; SDAG-NEXT: v_xor_b32_e32 v8, v13, v7 +; SDAG-NEXT: v_xor_b32_e32 v9, v11, v6 +; SDAG-NEXT: v_xor_b32_e32 v11, v14, v7 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v9, v2, vcc -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v3, vcc -; SDAG-NEXT: v_xor_b32_e32 v4, v4, v6 +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v5, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc +; SDAG-NEXT: v_xor_b32_e32 v4, v10, v6 ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v7, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v11, v6, vcc -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v11, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: v_sdiv_v2i128_vv: @@ -498,10 +498,10 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v22, v18, 0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v23, v19, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, v20, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v9, v21, 0, vcc ; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GISEL-NEXT: v_cndmask_b32_e64 v23, v19, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, v21, 0, vcc ; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB0_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 @@ -537,71 +537,71 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB0_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v28 -; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28 +; GISEL-NEXT: v_add_i32_e32 v22, vcc, 0xffffffc0, v28 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, 64, v28 ; GISEL-NEXT: v_lshr_b64 v[0:1], v[20:21], v28 -; GISEL-NEXT: v_lshr_b64 v[2:3], v[18:19], v28 -; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], v22 -; GISEL-NEXT: v_or_b32_e32 v22, v2, v22 -; GISEL-NEXT: v_or_b32_e32 v23, v3, v23 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], v2 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[20:21], v22 +; GISEL-NEXT: v_lshr_b64 v[22:23], v[18:19], v28 ; GISEL-NEXT: s_mov_b64 s[8:9], 0 -; GISEL-NEXT: v_lshr_b64 v[2:3], v[20:21], v32 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v22, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v23, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, v2, v18, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v19, v3, v19, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v2, v22, v2 +; GISEL-NEXT: v_or_b32_e32 v3, v23, v3 ; GISEL-NEXT: v_add_i32_e32 v32, vcc, -1, v26 ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v27, vcc +; GISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v20, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v21, v3, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v10, vcc ; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v11, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] -; GISEL-NEXT: v_mov_b32_e32 v23, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, v0, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, v1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28 +; GISEL-NEXT: v_cndmask_b32_e32 v20, v2, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, v3, v19, vcc +; GISEL-NEXT: v_mov_b32_e32 v19, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 ; GISEL-NEXT: .LBB0_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshrrev_b32_e32 v36, 31, v17 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[16:17], 1 -; GISEL-NEXT: v_or_b32_e32 v16, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v17, v1, v3 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v19 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[18:19], 1 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 -; GISEL-NEXT: v_lshrrev_b32_e32 v18, 31, v9 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v32, v0 -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v33, v1, vcc -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v34, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v35, v3, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v22, 31, v18 -; GISEL-NEXT: v_and_b32_e32 v18, v22, v26 -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v0, v18 -; GISEL-NEXT: v_and_b32_e32 v0, v22, v27 -; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v1, v0, vcc -; GISEL-NEXT: v_and_b32_e32 v0, v22, v10 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v2, v0, vcc -; GISEL-NEXT: v_and_b32_e32 v0, v22, v11 -; GISEL-NEXT: v_subb_u32_e32 v21, vcc, v3, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v28, vcc, -1, v28 ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc ; GISEL-NEXT: v_addc_u32_e32 v30, vcc, -1, v30, vcc ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc -; GISEL-NEXT: v_or_b32_e32 v0, v28, v30 -; GISEL-NEXT: v_or_b32_e32 v1, v29, v31 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_and_b32_e32 v22, 1, v22 +; GISEL-NEXT: v_or_b32_e32 v2, v28, v30 +; GISEL-NEXT: v_or_b32_e32 v3, v29, v31 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_lshl_b64 v[2:3], v[22:23], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v18, 31, v21 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v18 +; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v18, 31, v9 +; GISEL-NEXT: v_or_b32_e32 v18, v20, v18 ; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v20, 31, v17 +; GISEL-NEXT: v_or_b32_e32 v8, v8, v20 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GISEL-NEXT: v_or_b32_e32 v16, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v17, v1, v17 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v32, v18 +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], v33, v21, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], v34, v2, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], v35, v3, s[4:5] +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GISEL-NEXT: v_and_b32_e32 v1, v0, v26 +; GISEL-NEXT: v_and_b32_e32 v22, v0, v27 +; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], v18, v1 +; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], v21, v22, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v1, v0, v10 +; GISEL-NEXT: v_and_b32_e32 v23, v0, v11 +; GISEL-NEXT: v_subb_u32_e64 v22, s[4:5], v2, v1, s[4:5] ; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GISEL-NEXT: v_or_b32_e32 v8, v8, v36 -; GISEL-NEXT: v_mov_b32_e32 v0, v22 -; GISEL-NEXT: v_mov_b32_e32 v1, v23 +; GISEL-NEXT: v_and_b32_e32 v18, 1, v0 +; GISEL-NEXT: v_subb_u32_e64 v23, vcc, v3, v23, s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v0, v18 +; GISEL-NEXT: v_mov_b32_e32 v1, v19 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB0_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 @@ -619,43 +619,43 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x7f -; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v11, 0 ; GISEL-NEXT: v_xor_b32_e32 v0, v18, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v18, v5 ; GISEL-NEXT: v_xor_b32_e32 v2, v18, v6 ; GISEL-NEXT: v_xor_b32_e32 v3, v18, v7 ; GISEL-NEXT: v_xor_b32_e32 v4, v19, v12 ; GISEL-NEXT: v_xor_b32_e32 v5, v19, v13 -; GISEL-NEXT: v_xor_b32_e32 v12, v19, v14 -; GISEL-NEXT: v_xor_b32_e32 v13, v19, v15 +; GISEL-NEXT: v_xor_b32_e32 v14, v19, v14 +; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15 ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18 ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc ; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], v4, v19 ; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], v5, v19, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v2, v18, vcc -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v3, v18, vcc -; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v12, v19, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v13, v19, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v12, v21 -; GISEL-NEXT: v_ffbh_u32_e32 v13, v20 -; GISEL-NEXT: v_ffbh_u32_e32 v14, v7 -; GISEL-NEXT: v_ffbh_u32_e32 v15, v6 +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc +; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v14, v21 +; GISEL-NEXT: v_ffbh_u32_e32 v15, v20 +; GISEL-NEXT: v_ffbh_u32_e32 v16, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v17, v6 ; GISEL-NEXT: v_or_b32_e32 v0, v20, v4 ; GISEL-NEXT: v_or_b32_e32 v1, v21, v5 -; GISEL-NEXT: v_or_b32_e32 v2, v6, v10 -; GISEL-NEXT: v_or_b32_e32 v3, v7, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 32, v13 +; GISEL-NEXT: v_or_b32_e32 v2, v6, v12 +; GISEL-NEXT: v_or_b32_e32 v3, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 ; GISEL-NEXT: v_ffbh_u32_e32 v26, v5 ; GISEL-NEXT: v_ffbh_u32_e32 v27, v4 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v11 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v10 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 32, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v13 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v12 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] -; GISEL-NEXT: v_min_u32_e32 v0, v12, v13 +; GISEL-NEXT: v_min_u32_e32 v0, v14, v15 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v27 -; GISEL-NEXT: v_min_u32_e32 v2, v14, v15 +; GISEL-NEXT: v_min_u32_e32 v2, v16, v17 ; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v29 ; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 ; GISEL-NEXT: v_min_u32_e32 v1, v26, v1 @@ -665,32 +665,32 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[16:17] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, 0x7f, v2 +; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v2 ; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v12, v12, v0 -; GISEL-NEXT: v_or_b32_e32 v13, v3, v1 +; GISEL-NEXT: v_or_b32_e32 v10, v10, v0 +; GISEL-NEXT: v_or_b32_e32 v11, v3, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v13, v14, v15 -; GISEL-NEXT: v_and_b32_e32 v14, 1, v13 -; GISEL-NEXT: v_or_b32_e32 v12, v13, v12 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v11, v14, v15 +; GISEL-NEXT: v_and_b32_e32 v14, 1, v11 +; GISEL-NEXT: v_or_b32_e32 v10, v11, v10 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v16, 1, v12 +; GISEL-NEXT: v_and_b32_e32 v16, 1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] @@ -703,22 +703,22 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v0, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v1, vcc ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v30, v2 -; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], 64, v30 +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v30 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[6:7], v30 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], v30 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v30 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: v_lshr_b64 v[12:13], v[6:7], v12 +; GISEL-NEXT: v_lshr_b64 v[10:11], v[6:7], v10 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v14 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 ; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc -; GISEL-NEXT: v_or_b32_e32 v0, v12, v2 -; GISEL-NEXT: v_or_b32_e32 v1, v13, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v10, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v11, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v0, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, v1, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 @@ -728,28 +728,28 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB0_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader -; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26 +; GISEL-NEXT: v_add_i32_e32 v30, vcc, 0xffffffc0, v26 ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v26 -; GISEL-NEXT: v_lshr_b64 v[0:1], v[10:11], v26 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26 ; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16 +; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v30 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v20 ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v21, vcc -; GISEL-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 -; GISEL-NEXT: v_lshr_b64 v[10:11], v[10:11], v32 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 +; GISEL-NEXT: v_or_b32_e32 v3, v3, v17 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v5, vcc ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 -; GISEL-NEXT: v_or_b32_e32 v3, v3, v17 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, v3, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v2, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v3, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v7, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 @@ -757,20 +757,20 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: .LBB0_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], 1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], 1 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v13 +; GISEL-NEXT: v_or_b32_e32 v16, v16, v6 ; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v11 -; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v13 -; GISEL-NEXT: v_lshl_b64 v[10:11], v[14:15], 1 -; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v15 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v6 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[14:15], 1 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v15 ; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26 ; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc -; GISEL-NEXT: v_or_b32_e32 v16, v16, v6 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 -; GISEL-NEXT: v_or_b32_e32 v12, v12, v14 -; GISEL-NEXT: v_or_b32_e32 v14, v0, v10 -; GISEL-NEXT: v_or_b32_e32 v15, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v10, v10, v6 +; GISEL-NEXT: v_or_b32_e32 v14, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v15, v1, v13 ; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v2 @@ -782,17 +782,17 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v1, v0, v20 +; GISEL-NEXT: v_and_b32_e32 v13, v0, v21 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v1 ; GISEL-NEXT: v_and_b32_e32 v6, 1, v0 -; GISEL-NEXT: v_and_b32_e32 v10, v0, v20 -; GISEL-NEXT: v_and_b32_e32 v11, v0, v21 -; GISEL-NEXT: v_and_b32_e32 v34, v0, v4 -; GISEL-NEXT: v_and_b32_e32 v35, v0, v5 +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v13, vcc +; GISEL-NEXT: v_and_b32_e32 v1, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v2, v0, v5 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v6 ; GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v3, v11, vcc -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v34, vcc -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v2, vcc ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GISEL-NEXT: s_cbranch_execnz .LBB0_9 ; GISEL-NEXT: ; %bb.10: ; %Flow @@ -800,9 +800,9 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: .LBB0_11: ; %Flow11 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1 -; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v15 -; GISEL-NEXT: v_or_b32_e32 v12, v12, v4 +; GISEL-NEXT: v_or_b32_e32 v10, v10, v4 ; GISEL-NEXT: v_or_b32_e32 v14, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v15, v1, v3 ; GISEL-NEXT: .LBB0_12: ; %Flow12 @@ -815,8 +815,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3 ; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7 ; GISEL-NEXT: v_xor_b32_e32 v5, v15, v7 -; GISEL-NEXT: v_xor_b32_e32 v8, v12, v7 -; GISEL-NEXT: v_xor_b32_e32 v9, v13, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v10, v7 +; GISEL-NEXT: v_xor_b32_e32 v9, v11, v7 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v7 @@ -869,144 +869,144 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v21 -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[21:22] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v22 ; SDAG-NEXT: v_subb_u32_e32 v24, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v16, v23 -; SDAG-NEXT: v_or_b32_e32 v17, v22, v24 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[23:24] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[22:23] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 +; SDAG-NEXT: v_or_b32_e32 v17, v23, v25 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25] ; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[23:24] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[24:25] ; SDAG-NEXT: v_cndmask_b32_e64 v16, v19, v18, s[4:5] ; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v3, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v3, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v1, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc -; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v0, 0, s[4:5] ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v21 -; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v21 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_add_i32_e32 v26, vcc, 1, v22 +; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v22 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v22, vcc +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v23, vcc ; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 -; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v23, vcc -; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v24, vcc -; SDAG-NEXT: v_or_b32_e32 v22, v18, v28 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v21 -; SDAG-NEXT: v_or_b32_e32 v23, v27, v29 -; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v26 -; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 64, v26 -; SDAG-NEXT: v_lshl_b64 v[30:31], v[0:1], v26 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23] -; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v21 -; SDAG-NEXT: v_or_b32_e32 v22, v25, v22 -; SDAG-NEXT: v_or_b32_e32 v21, v24, v21 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v22, v17, v22, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v16, v21, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v30, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, v21, v2, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v24, vcc +; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v25, vcc +; SDAG-NEXT: v_or_b32_e32 v18, v26, v28 +; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v22 +; SDAG-NEXT: v_or_b32_e32 v19, v27, v29 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v30 +; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v30 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v30 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v31 +; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v17, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v16, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v24, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 -; SDAG-NEXT: v_mov_b32_e32 v24, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 -; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v18 -; SDAG-NEXT: v_sub_i32_e32 v23, vcc, 64, v18 -; SDAG-NEXT: v_lshl_b64 v[23:24], v[2:3], v23 -; SDAG-NEXT: v_or_b32_e32 v24, v20, v24 -; SDAG-NEXT: v_or_b32_e32 v23, v19, v23 -; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 -; SDAG-NEXT: v_subrev_i32_e64 v19, s[4:5], 64, v18 -; SDAG-NEXT: v_lshr_b64 v[19:20], v[2:3], v19 -; SDAG-NEXT: v_cndmask_b32_e32 v20, v20, v24, vcc -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v20, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e32 v19, v19, v23, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v0, v19, v0, s[4:5] -; SDAG-NEXT: v_lshr_b64 v[2:3], v[2:3], v18 -; SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v26 +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 64, v26 +; SDAG-NEXT: v_subrev_i32_e32 v23, vcc, 64, v26 +; SDAG-NEXT: v_lshr_b64 v[24:25], v[2:3], v26 ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v8 +; SDAG-NEXT: v_lshl_b64 v[31:32], v[2:3], v22 +; SDAG-NEXT: v_lshr_b64 v[2:3], v[2:3], v23 +; SDAG-NEXT: v_or_b32_e32 v32, v21, v32 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v31 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v20, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v24, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v1, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc -; SDAG-NEXT: v_mov_b32_e32 v23, 0 -; SDAG-NEXT: v_mov_b32_e32 v24, 0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_mov_b32_e32 v25, 0 -; SDAG-NEXT: v_mov_b32_e32 v26, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v17 -; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; SDAG-NEXT: v_or_b32_e32 v17, v26, v17 -; SDAG-NEXT: v_or_b32_e32 v16, v25, v16 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v3 +; SDAG-NEXT: v_or_b32_e32 v20, v24, v20 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v19, 31, v1 -; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v19 -; SDAG-NEXT: v_lshrrev_b32_e32 v19, 31, v22 -; SDAG-NEXT: v_or_b32_e32 v0, v0, v19 -; SDAG-NEXT: v_sub_i32_e32 v19, vcc, v30, v0 -; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v31, v1, vcc -; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v32, v2, vcc -; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v33, v3, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v19, 31, v19 -; SDAG-NEXT: v_and_b32_e32 v25, v19, v8 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v25 -; SDAG-NEXT: v_and_b32_e32 v25, v19, v9 -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v25, vcc -; SDAG-NEXT: v_and_b32_e32 v25, v19, v10 -; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v25, vcc -; SDAG-NEXT: v_and_b32_e32 v25, v19, v11 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v25, vcc -; SDAG-NEXT: v_add_i32_e32 v18, vcc, -1, v18 +; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v19 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v24 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v30, v2 +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v31, v3, vcc +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v32, v20, vcc +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v33, v25, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v36, 31, v24 +; SDAG-NEXT: v_and_b32_e32 v24, v36, v8 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v24 +; SDAG-NEXT: v_and_b32_e32 v24, v36, v9 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v24, vcc +; SDAG-NEXT: v_and_b32_e32 v24, v36, v10 +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v20, v24, vcc +; SDAG-NEXT: v_and_b32_e32 v20, v36, v11 +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v20, vcc +; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v26 ; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc ; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc ; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc -; SDAG-NEXT: v_or_b32_e32 v25, v18, v28 -; SDAG-NEXT: v_or_b32_e32 v26, v27, v29 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] -; SDAG-NEXT: v_and_b32_e32 v19, 1, v19 -; SDAG-NEXT: v_lshl_b64 v[21:22], v[21:22], 1 -; SDAG-NEXT: v_or_b32_e32 v21, v21, v34 -; SDAG-NEXT: v_or_b32_e32 v22, v24, v22 -; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 -; SDAG-NEXT: v_mov_b32_e32 v26, v20 -; SDAG-NEXT: v_mov_b32_e32 v25, v19 -; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v34, v26, v28 +; SDAG-NEXT: v_or_b32_e32 v35, v27, v29 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[34:35] +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v17 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v17, v1, v17 +; SDAG-NEXT: v_or_b32_e32 v16, v0, v16 +; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: v_and_b32_e32 v20, 1, v36 +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 +; SDAG-NEXT: v_mov_b32_e32 v0, v20 +; SDAG-NEXT: v_mov_b32_e32 v1, v21 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB1_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 -; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB1_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[21:22], 1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[18:19], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v17 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 -; SDAG-NEXT: v_or_b32_e32 v16, v24, v1 -; SDAG-NEXT: v_or_b32_e32 v18, v20, v3 -; SDAG-NEXT: v_or_b32_e32 v17, v23, v0 -; SDAG-NEXT: v_or_b32_e32 v19, v19, v2 +; SDAG-NEXT: v_or_b32_e32 v19, v23, v1 +; SDAG-NEXT: v_or_b32_e32 v16, v21, v3 +; SDAG-NEXT: v_or_b32_e32 v18, v22, v0 +; SDAG-NEXT: v_or_b32_e32 v17, v20, v2 ; SDAG-NEXT: .LBB1_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_or_b32_e32 v1, v13, v15 @@ -1044,22 +1044,22 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v1, vcc -; SDAG-NEXT: v_xor_b32_e32 v0, 0x7f, v2 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc +; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v0 ; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v24, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[2:3] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1] ; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v24, vcc -; SDAG-NEXT: v_or_b32_e32 v0, v0, v20 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v20 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v1, v3, v21 +; SDAG-NEXT: v_or_b32_e32 v3, v1, v21 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_and_b32_e32 v0, 1, v8 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_and_b32_e32 v2, 1, v8 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 @@ -1070,32 +1070,32 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v2 -; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v2 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v3, vcc +; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v0 +; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc ; SDAG-NEXT: v_lshl_b64 v[8:9], v[4:5], v8 ; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v20, vcc ; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v21, vcc ; SDAG-NEXT: v_or_b32_e32 v10, v22, v24 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v2 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v0 ; SDAG-NEXT: v_or_b32_e32 v11, v23, v25 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], v26 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], v26 ; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v26 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v27 -; SDAG-NEXT: v_or_b32_e32 v3, v3, v11 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v10 +; SDAG-NEXT: v_or_b32_e32 v1, v1, v11 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v10 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v21, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v20, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1103,60 +1103,60 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_cbranch_execz .LBB1_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader ; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v22 -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 64, v22 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, 64, v22 ; SDAG-NEXT: v_subrev_i32_e32 v27, vcc, 64, v22 ; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v22 ; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v12 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v1, 0 -; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v2 ; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v27 ; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v0, v11, v32 +; SDAG-NEXT: v_or_b32_e32 v2, v11, v32 ; SDAG-NEXT: v_or_b32_e32 v10, v10, v31 ; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22 -; SDAG-NEXT: v_cndmask_b32_e64 v0, v7, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v30, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v29, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 -; SDAG-NEXT: v_cndmask_b32_e32 v7, v0, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v7, v2, v5, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc ; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: v_mov_b32_e32 v5, 0 ; SDAG-NEXT: .LBB1_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v0, 31, v7 +; SDAG-NEXT: v_lshrrev_b32_e32 v2, 31, v7 ; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v3 -; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v9 +; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v9 ; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v0 -; SDAG-NEXT: v_or_b32_e32 v0, v6, v30 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v31 -; SDAG-NEXT: v_or_b32_e32 v3, v21, v3 -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v26, v0 -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v27, v7, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v28, v10, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v29, v11, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v30, 31, v6 -; SDAG-NEXT: v_and_b32_e32 v31, v30, v13 -; SDAG-NEXT: v_and_b32_e32 v6, v30, v12 -; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v0, v6 -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v31, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v5, v9 -; SDAG-NEXT: v_or_b32_e32 v2, v20, v2 -; SDAG-NEXT: v_and_b32_e32 v0, 1, v30 -; SDAG-NEXT: v_and_b32_e32 v5, v30, v15 -; SDAG-NEXT: v_and_b32_e32 v30, v30, v14 -; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v10, v30, vcc -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v5, vcc +; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v2 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v30 +; SDAG-NEXT: v_or_b32_e32 v2, v8, v31 +; SDAG-NEXT: v_or_b32_e32 v9, v21, v9 +; SDAG-NEXT: v_or_b32_e32 v1, v5, v1 +; SDAG-NEXT: v_or_b32_e32 v8, v20, v2 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v26, v6 +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v27, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v28, v10, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v29, v11, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v2 +; SDAG-NEXT: v_and_b32_e32 v2, 1, v5 +; SDAG-NEXT: v_and_b32_e32 v30, v5, v15 +; SDAG-NEXT: v_and_b32_e32 v31, v5, v12 +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v31 +; SDAG-NEXT: v_and_b32_e32 v31, v5, v14 +; SDAG-NEXT: v_and_b32_e32 v5, v5, v13 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v5, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v10, v31, vcc +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v30, vcc ; SDAG-NEXT: v_add_i32_e32 v22, vcc, -1, v22 ; SDAG-NEXT: v_addc_u32_e32 v23, vcc, -1, v23, vcc ; SDAG-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc @@ -1165,29 +1165,29 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v30, v22, v24 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31] ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v8, v4, v8 -; SDAG-NEXT: v_mov_b32_e32 v5, v1 -; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_or_b32_e32 v0, v4, v0 +; SDAG-NEXT: v_mov_b32_e32 v5, v3 +; SDAG-NEXT: v_mov_b32_e32 v4, v2 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB1_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB1_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v9 ; SDAG-NEXT: v_lshl_b64 v[4:5], v[8:9], 1 -; SDAG-NEXT: v_or_b32_e32 v2, v2, v6 -; SDAG-NEXT: v_or_b32_e32 v8, v21, v3 -; SDAG-NEXT: v_or_b32_e32 v10, v1, v5 -; SDAG-NEXT: v_or_b32_e32 v9, v20, v2 -; SDAG-NEXT: v_or_b32_e32 v11, v0, v4 +; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v6 +; SDAG-NEXT: v_or_b32_e32 v8, v21, v5 +; SDAG-NEXT: v_or_b32_e32 v10, v3, v1 +; SDAG-NEXT: v_or_b32_e32 v9, v20, v4 +; SDAG-NEXT: v_or_b32_e32 v11, v2, v0 ; SDAG-NEXT: .LBB1_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v0, v19 -; SDAG-NEXT: v_mov_b32_e32 v1, v18 -; SDAG-NEXT: v_mov_b32_e32 v2, v17 -; SDAG-NEXT: v_mov_b32_e32 v3, v16 +; SDAG-NEXT: v_mov_b32_e32 v0, v17 +; SDAG-NEXT: v_mov_b32_e32 v1, v16 +; SDAG-NEXT: v_mov_b32_e32 v2, v18 +; SDAG-NEXT: v_mov_b32_e32 v3, v19 ; SDAG-NEXT: v_mov_b32_e32 v4, v11 ; SDAG-NEXT: v_mov_b32_e32 v5, v10 ; SDAG-NEXT: v_mov_b32_e32 v6, v9 @@ -1254,11 +1254,11 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc ; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB1_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v22 @@ -1289,81 +1289,81 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v20, s10 ; GISEL-NEXT: v_mov_b32_e32 v19, s9 ; GISEL-NEXT: v_mov_b32_e32 v18, s8 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB1_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26 +; GISEL-NEXT: v_add_i32_e32 v30, vcc, 0xffffffc0, v26 ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v26 ; GISEL-NEXT: v_lshr_b64 v[18:19], v[16:17], v26 ; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v26 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_lshl_b64 v[24:25], v[16:17], v24 +; GISEL-NEXT: v_lshr_b64 v[16:17], v[16:17], v30 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v8 ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc -; GISEL-NEXT: v_lshl_b64 v[24:25], v[16:17], v24 -; GISEL-NEXT: v_lshr_b64 v[16:17], v[16:17], v32 -; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc -; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc -; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_or_b32_e32 v20, v20, v24 ; GISEL-NEXT: v_or_b32_e32 v21, v21, v25 +; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v20, v16, v20, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v21, v17, v21, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v18, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v20, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, v17, v21, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v19, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v24, v20, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v25, v21, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: v_mov_b32_e32 v21, s7 -; GISEL-NEXT: v_mov_b32_e32 v20, s6 -; GISEL-NEXT: v_mov_b32_e32 v19, s5 -; GISEL-NEXT: v_mov_b32_e32 v18, s4 +; GISEL-NEXT: v_mov_b32_e32 v21, s11 +; GISEL-NEXT: v_mov_b32_e32 v20, s10 +; GISEL-NEXT: v_mov_b32_e32 v19, s9 +; GISEL-NEXT: v_mov_b32_e32 v18, s8 ; GISEL-NEXT: .LBB1_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v23 -; GISEL-NEXT: v_lshl_b64 v[20:21], v[22:23], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v25 -; GISEL-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 -; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v35, 31, v3 ; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26 ; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc +; GISEL-NEXT: v_lshl_b64 v[20:21], v[24:25], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v17 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GISEL-NEXT: v_or_b32_e32 v20, v20, v0 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v3 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GISEL-NEXT: v_or_b32_e32 v22, v18, v20 -; GISEL-NEXT: v_or_b32_e32 v23, v19, v21 -; GISEL-NEXT: v_or_b32_e32 v16, v16, v0 -; GISEL-NEXT: v_or_b32_e32 v20, v24, v35 +; GISEL-NEXT: v_or_b32_e32 v0, v16, v0 +; GISEL-NEXT: v_lshrrev_b32_e32 v16, 31, v23 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 ; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v20 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v31, v25, vcc +; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 +; GISEL-NEXT: v_or_b32_e32 v22, v18, v22 +; GISEL-NEXT: v_or_b32_e32 v23, v19, v23 ; GISEL-NEXT: v_or_b32_e32 v18, v26, v28 ; GISEL-NEXT: v_or_b32_e32 v19, v27, v29 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v32, v16, vcc -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v17, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v0 -; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v18, v0, v8 -; GISEL-NEXT: v_and_b32_e32 v19, v0, v9 -; GISEL-NEXT: v_and_b32_e32 v21, v0, v10 -; GISEL-NEXT: v_and_b32_e32 v35, v0, v11 -; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 -; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v20, v18 -; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v19, vcc -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v21, vcc -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc -; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v30, v0 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v31, v17, vcc +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v32, v20, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v33, v21, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v16 +; GISEL-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GISEL-NEXT: v_and_b32_e32 v16, v18, v8 +; GISEL-NEXT: v_and_b32_e32 v19, v18, v9 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v16 +; GISEL-NEXT: v_and_b32_e32 v24, v18, v10 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v19, vcc +; GISEL-NEXT: v_and_b32_e32 v19, v18, v11 +; GISEL-NEXT: v_and_b32_e32 v0, 1, v18 +; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v20, v24, vcc +; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v21, v19, vcc ; GISEL-NEXT: v_mov_b32_e32 v19, v1 ; GISEL-NEXT: v_mov_b32_e32 v18, v0 -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB1_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB1_5: ; %Flow14 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB1_5: ; %Flow14 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_lshl_b64 v[0:1], v[22:23], 1 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v8, 31, v23 @@ -1371,7 +1371,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v18, v18, v0 ; GISEL-NEXT: v_or_b32_e32 v19, v19, v1 ; GISEL-NEXT: .LBB1_6: ; %Flow16 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v0, v12, v14 ; GISEL-NEXT: v_or_b32_e32 v1, v13, v15 @@ -1563,8 +1563,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v29, v28 ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc @@ -1575,63 +1575,63 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v18, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v18, v16 ; SDAG-NEXT: v_ffbh_u32_e32 v20, v17 -; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v2, v16, v0 -; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8 ; SDAG-NEXT: v_ffbh_u32_e32 v22, v0 -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 ; SDAG-NEXT: v_or_b32_e32 v3, v17, v1 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc +; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v22 ; SDAG-NEXT: v_min_u32_e32 v18, v18, v20 -; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v22 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v1 -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] -; SDAG-NEXT: v_min_u32_e32 v3, v20, v22 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v1 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[6:7] +; SDAG-NEXT: v_min_u32_e32 v3, v22, v20 ; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18 -; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v9, s[4:5] +; SDAG-NEXT: v_addc_u32_e64 v9, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v10, v8, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v10, v9, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v20, v8, v3, vcc ; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 ; SDAG-NEXT: v_ffbh_u32_e32 v21, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v18, s[6:7] ; SDAG-NEXT: v_or_b32_e32 v8, v31, v2 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v9 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v2 ; SDAG-NEXT: v_or_b32_e32 v9, v30, v3 ; SDAG-NEXT: v_min_u32_e32 v11, v11, v21 -; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20 +; SDAG-NEXT: v_add_i32_e32 v18, vcc, 32, v18 ; SDAG-NEXT: v_ffbh_u32_e32 v21, v3 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_min_u32_e32 v8, v20, v21 -; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11 -; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5] -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10 -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc -; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10 +; SDAG-NEXT: v_min_u32_e32 v8, v18, v21 +; SDAG-NEXT: v_add_i32_e64 v9, s[6:7], 64, v11 +; SDAG-NEXT: v_addc_u32_e64 v11, s[6:7], 0, 0, s[6:7] +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7] +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v20 +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v10, vcc +; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v8 ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v19, vcc -; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 -; SDAG-NEXT: v_or_b32_e32 v9, v11, v19 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 +; SDAG-NEXT: v_or_b32_e32 v11, v9, v19 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v21, v20, s[4:5] -; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v10, v21, v20, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v10, 1, v10 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v34, v1, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 @@ -1642,91 +1642,91 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB2_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10 -; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 -; SDAG-NEXT: v_mov_b32_e32 v9, 0 -; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc +; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v8 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc ; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20 ; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc ; SDAG-NEXT: v_or_b32_e32 v18, v32, v34 -; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v8 ; SDAG-NEXT: v_or_b32_e32 v19, v33, v35 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[0:1], v24 ; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25 -; SDAG-NEXT: v_or_b32_e32 v11, v11, v19 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 +; SDAG-NEXT: v_or_b32_e32 v9, v9, v19 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v21, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v22, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v20, v0, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 ; SDAG-NEXT: v_lshr_b64 v[22:23], v[16:17], v32 -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 64, v32 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v32 ; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32 ; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32 ; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31 -; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v9, 0 -; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v8 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v10 ; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37 ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc -; SDAG-NEXT: v_or_b32_e32 v8, v23, v27 +; SDAG-NEXT: v_or_b32_e32 v10, v23, v27 ; SDAG-NEXT: v_or_b32_e32 v22, v22, v26 ; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v2, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32 -; SDAG-NEXT: v_cndmask_b32_e64 v8, v49, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v49, v10, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v22, v48, v22, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v3, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 -; SDAG-NEXT: v_cndmask_b32_e32 v25, v8, v17, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v25, v10, v17, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v24, v22, v16, vcc ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21 -; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v25 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v25 ; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v11 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 -; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 -; SDAG-NEXT: v_or_b32_e32 v22, v26, v48 -; SDAG-NEXT: v_or_b32_e32 v23, v24, v49 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 -; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v36, v23 -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v25, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v38, v22, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v39, v27, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; SDAG-NEXT: v_and_b32_e32 v24, v8, v31 -; SDAG-NEXT: v_and_b32_e32 v26, v8, v30 -; SDAG-NEXT: v_and_b32_e32 v48, v8, v2 -; SDAG-NEXT: v_and_b32_e32 v49, v8, v3 -; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 -; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v23, v24 -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v26, vcc -; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v22, v48, vcc +; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v19 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_or_b32_e32 v10, v26, v10 +; SDAG-NEXT: v_or_b32_e32 v24, v24, v48 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v49 +; SDAG-NEXT: v_or_b32_e32 v9, v23, v9 +; SDAG-NEXT: v_or_b32_e32 v8, v22, v8 +; SDAG-NEXT: v_or_b32_e32 v19, v21, v19 +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v36, v24 +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v37, v25, vcc +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v38, v10, vcc +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v39, v27, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v48, 31, v22 +; SDAG-NEXT: v_and_b32_e32 v22, v48, v31 +; SDAG-NEXT: v_and_b32_e32 v23, v48, v30 +; SDAG-NEXT: v_and_b32_e32 v26, v48, v2 +; SDAG-NEXT: v_and_b32_e32 v49, v48, v3 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v24, v22 +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v23, vcc +; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v10, v26, vcc ; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v49, vcc ; SDAG-NEXT: v_add_i32_e32 v32, vcc, -1, v32 ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc @@ -1735,25 +1735,25 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v22, v32, v34 ; SDAG-NEXT: v_or_b32_e32 v23, v33, v35 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] -; SDAG-NEXT: v_or_b32_e32 v11, v19, v11 +; SDAG-NEXT: v_and_b32_e32 v10, 1, v48 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v10, v18, v10 -; SDAG-NEXT: v_mov_b32_e32 v23, v9 -; SDAG-NEXT: v_mov_b32_e32 v22, v8 +; SDAG-NEXT: v_or_b32_e32 v18, v20, v18 +; SDAG-NEXT: v_mov_b32_e32 v23, v11 +; SDAG-NEXT: v_mov_b32_e32 v22, v10 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB2_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB2_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21 -; SDAG-NEXT: v_or_b32_e32 v10, v10, v22 -; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v34, v19, v11 -; SDAG-NEXT: v_or_b32_e32 v32, v18, v10 -; SDAG-NEXT: v_or_b32_e32 v27, v9, v21 -; SDAG-NEXT: v_or_b32_e32 v33, v8, v20 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v22 +; SDAG-NEXT: v_or_b32_e32 v34, v21, v19 +; SDAG-NEXT: v_or_b32_e32 v27, v11, v9 +; SDAG-NEXT: v_or_b32_e32 v32, v20, v18 +; SDAG-NEXT: v_or_b32_e32 v33, v10, v8 ; SDAG-NEXT: .LBB2_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7 @@ -1880,8 +1880,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v12 ; SDAG-NEXT: v_lshr_b64 v[53:54], v[4:5], v51 -; SDAG-NEXT: v_addc_u32_e32 v51, vcc, -1, v36, vcc ; SDAG-NEXT: v_or_b32_e32 v12, v21, v25 +; SDAG-NEXT: v_addc_u32_e32 v51, vcc, -1, v36, vcc ; SDAG-NEXT: v_or_b32_e32 v20, v20, v24 ; SDAG-NEXT: v_addc_u32_e32 v52, vcc, -1, v6, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v38 @@ -1899,40 +1899,40 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v12, 31, v23 +; SDAG-NEXT: v_or_b32_e32 v12, v24, v12 ; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v15 -; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v11 -; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v24, v24, v12 -; SDAG-NEXT: v_or_b32_e32 v22, v22, v54 -; SDAG-NEXT: v_or_b32_e32 v12, v14, v55 -; SDAG-NEXT: v_or_b32_e32 v15, v19, v15 -; SDAG-NEXT: v_or_b32_e32 v11, v21, v11 -; SDAG-NEXT: v_or_b32_e32 v14, v18, v12 -; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v50, v22 -; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v51, v23, vcc -; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v52, v24, vcc -; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v53, v25, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v12 -; SDAG-NEXT: v_and_b32_e32 v12, 1, v21 -; SDAG-NEXT: v_and_b32_e32 v54, v21, v7 -; SDAG-NEXT: v_and_b32_e32 v55, v21, v6 -; SDAG-NEXT: v_and_b32_e32 v40, v21, v36 -; SDAG-NEXT: v_and_b32_e32 v21, v21, v37 -; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v22, v21 -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v40, vcc -; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v24, v55, vcc -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v54, vcc +; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v15 +; SDAG-NEXT: v_or_b32_e32 v22, v22, v24 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v50, v22 +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v51, v23, vcc +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v52, v12, vcc +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v53, v25, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v40, 31, v24 +; SDAG-NEXT: v_and_b32_e32 v24, v40, v37 +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v22, v24 +; SDAG-NEXT: v_and_b32_e32 v24, v40, v36 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v24, vcc +; SDAG-NEXT: v_and_b32_e32 v24, v40, v6 +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v12, v24, vcc +; SDAG-NEXT: v_and_b32_e32 v12, v40, v7 +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v12, vcc ; SDAG-NEXT: v_add_i32_e32 v38, vcc, -1, v38 ; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v39, vcc ; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v48, vcc ; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v49, vcc -; SDAG-NEXT: v_or_b32_e32 v55, v39, v49 ; SDAG-NEXT: v_or_b32_e32 v54, v38, v48 +; SDAG-NEXT: v_or_b32_e32 v55, v39, v49 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[54:55] -; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v12, 31, v11 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v12 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_or_b32_e32 v11, v21, v11 ; SDAG-NEXT: v_or_b32_e32 v10, v20, v10 +; SDAG-NEXT: v_or_b32_e32 v15, v19, v15 +; SDAG-NEXT: v_and_b32_e32 v12, 1, v40 +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v14, v18, v14 ; SDAG-NEXT: v_mov_b32_e32 v21, v13 ; SDAG-NEXT: v_mov_b32_e32 v20, v12 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] @@ -2021,33 +2021,33 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_srem_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_ashrrev_i32_e32 v28, 31, v3 +; GISEL-NEXT: v_ashrrev_i32_e32 v29, 31, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v11 ; GISEL-NEXT: v_mov_b32_e32 v19, 0x7f ; GISEL-NEXT: v_mov_b32_e32 v20, 0 ; GISEL-NEXT: s_mov_b64 s[8:9], 0 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v28 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v29 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v29 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v29 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v29 ; GISEL-NEXT: v_xor_b32_e32 v8, v8, v18 ; GISEL-NEXT: v_xor_b32_e32 v9, v9, v18 ; GISEL-NEXT: v_xor_b32_e32 v10, v10, v18 ; GISEL-NEXT: v_xor_b32_e32 v11, v11, v18 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v28 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v28, vcc -; GISEL-NEXT: v_sub_i32_e64 v30, s[4:5], v8, v18 -; GISEL-NEXT: v_subb_u32_e64 v29, s[4:5], v9, v18, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v2, v28, vcc -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v3, v28, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v29 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v29, vcc +; GISEL-NEXT: v_sub_i32_e64 v31, s[4:5], v8, v18 +; GISEL-NEXT: v_subb_u32_e64 v30, s[4:5], v9, v18, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v2, v29, vcc +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v3, v29, vcc ; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v18, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v18, v29 -; GISEL-NEXT: v_ffbh_u32_e32 v21, v30 +; GISEL-NEXT: v_ffbh_u32_e32 v18, v30 +; GISEL-NEXT: v_ffbh_u32_e32 v21, v31 ; GISEL-NEXT: v_ffbh_u32_e32 v22, v17 ; GISEL-NEXT: v_ffbh_u32_e32 v23, v16 -; GISEL-NEXT: v_or_b32_e32 v0, v30, v10 -; GISEL-NEXT: v_or_b32_e32 v1, v29, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v31, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v30, v11 ; GISEL-NEXT: v_or_b32_e32 v2, v16, v8 ; GISEL-NEXT: v_or_b32_e32 v3, v17, v9 ; GISEL-NEXT: v_add_i32_e32 v21, vcc, 32, v21 @@ -2092,126 +2092,126 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v19, 1, v19 ; GISEL-NEXT: v_and_b32_e32 v18, 1, v18 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v28, v16, 0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc ; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB2_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 -; GISEL-NEXT: v_add_i32_e32 v31, vcc, 1, v2 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v2 ; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v2 +; GISEL-NEXT: v_sub_i32_e32 v25, vcc, 0x7f, v2 ; GISEL-NEXT: v_not_b32_e32 v2, 63 ; GISEL-NEXT: v_addc_u32_e64 v33, vcc, 0, v0, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v34, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v24, v2 -; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 64, v24 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v24 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[8:9], v24 +; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v25, v2 +; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v25 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v25 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[8:9], v25 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: v_lshr_b64 v[18:19], v[16:17], v18 -; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc -; GISEL-NEXT: v_or_b32_e32 v0, v18, v2 -; GISEL-NEXT: v_or_b32_e32 v1, v19, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GISEL-NEXT: v_cndmask_b32_e32 v18, v0, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v19, v1, v9, vcc +; GISEL-NEXT: v_lshr_b64 v[21:22], v[16:17], v20 +; GISEL-NEXT: v_lshl_b64 v[23:24], v[16:17], v19 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v25 +; GISEL-NEXT: v_cndmask_b32_e32 v19, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v21, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v22, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v23, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v24, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; GISEL-NEXT: v_cndmask_b32_e32 v21, v0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v22, v1, v9, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB2_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v31 -; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v31 -; GISEL-NEXT: v_lshr_b64 v[0:1], v[8:9], v31 -; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v31 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GISEL-NEXT: v_add_i32_e32 v35, vcc, -1, v30 -; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v29, vcc -; GISEL-NEXT: v_lshl_b64 v[22:23], v[8:9], v22 -; GISEL-NEXT: v_lshr_b64 v[24:25], v[8:9], v24 +; GISEL-NEXT: v_add_i32_e32 v25, vcc, 0xffffffc0, v18 +; GISEL-NEXT: v_sub_i32_e32 v23, vcc, 64, v18 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[8:9], v18 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v18 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_add_i32_e32 v35, vcc, -1, v31 +; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v30, vcc +; GISEL-NEXT: v_lshl_b64 v[23:24], v[8:9], v23 +; GISEL-NEXT: v_lshr_b64 v[25:26], v[8:9], v25 ; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v10, vcc ; GISEL-NEXT: v_addc_u32_e32 v38, vcc, -1, v11, vcc -; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 -; GISEL-NEXT: v_or_b32_e32 v3, v3, v23 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v31 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 -; GISEL-NEXT: v_cndmask_b32_e32 v26, v2, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v27, v3, v17, vcc -; GISEL-NEXT: v_mov_b32_e32 v23, 0 -; GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_or_b32_e32 v2, v2, v23 +; GISEL-NEXT: v_or_b32_e32 v3, v3, v24 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v25, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v26, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v28, 0, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_cndmask_b32_e32 v25, v2, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v26, v3, v17, vcc +; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 ; GISEL-NEXT: .LBB2_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshrrev_b32_e32 v39, 31, v21 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v27 -; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; GISEL-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v48, 31, v19 -; GISEL-NEXT: v_add_i32_e32 v31, vcc, -1, v31 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, -1, v18 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc -; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v2, v24, v22 -; GISEL-NEXT: v_or_b32_e32 v3, v26, v48 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[27:28], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v23, 31, v26 +; GISEL-NEXT: v_lshl_b64 v[25:26], v[25:26], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v27, 31, v22 +; GISEL-NEXT: v_lshl_b64 v[21:22], v[21:22], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v28, 31, v20 +; GISEL-NEXT: v_lshl_b64 v[19:20], v[19:20], 1 ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc ; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v35, v3 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v36, v27, vcc -; GISEL-NEXT: v_or_b32_e32 v0, v31, v33 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v23 +; GISEL-NEXT: v_or_b32_e32 v25, v25, v27 +; GISEL-NEXT: v_or_b32_e32 v21, v21, v28 +; GISEL-NEXT: v_or_b32_e32 v19, v0, v19 +; GISEL-NEXT: v_or_b32_e32 v20, v1, v20 +; GISEL-NEXT: v_or_b32_e32 v0, v18, v33 ; GISEL-NEXT: v_or_b32_e32 v1, v32, v34 -; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v37, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v38, v25, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22 -; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v1, v0, v30 -; GISEL-NEXT: v_and_b32_e32 v24, v0, v29 -; GISEL-NEXT: v_and_b32_e32 v48, v0, v10 -; GISEL-NEXT: v_and_b32_e32 v49, v0, v11 -; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 -; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v3, v1 -; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v24, vcc -; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v2, v48, vcc -; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc -; GISEL-NEXT: v_or_b32_e32 v18, v18, v39 -; GISEL-NEXT: v_mov_b32_e32 v0, v22 -; GISEL-NEXT: v_mov_b32_e32 v1, v23 -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v23, vcc, v35, v25 +; GISEL-NEXT: v_subb_u32_e32 v23, vcc, v36, v26, vcc +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v37, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v38, v3, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GISEL-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GISEL-NEXT: v_and_b32_e32 v1, v0, v31 +; GISEL-NEXT: v_and_b32_e32 v27, v0, v30 +; GISEL-NEXT: v_and_b32_e32 v28, v0, v10 +; GISEL-NEXT: v_and_b32_e32 v39, v0, v11 +; GISEL-NEXT: v_and_b32_e32 v23, 1, v0 +; GISEL-NEXT: v_sub_i32_e32 v25, vcc, v25, v1 +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v26, v27, vcc +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v2, v28, vcc +; GISEL-NEXT: v_subb_u32_e32 v28, vcc, v3, v39, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v23 +; GISEL-NEXT: v_mov_b32_e32 v1, v24 +; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB2_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB2_5: ; %Flow14 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 -; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v20, 31, v21 +; GISEL-NEXT: .LBB2_5: ; %Flow14 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_lshl_b64 v[2:3], v[19:20], 1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[21:22], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v20, 31, v20 ; GISEL-NEXT: v_or_b32_e32 v18, v18, v20 -; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v28, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v32, v1, v3 ; GISEL-NEXT: .LBB2_6: ; %Flow16 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v33, 31, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v15 @@ -2289,7 +2289,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB2_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 ; GISEL-NEXT: v_add_i32_e32 v36, vcc, 1, v14 @@ -2298,44 +2298,44 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_not_b32_e32 v2, 63 ; GISEL-NEXT: v_addc_u32_e64 v38, vcc, 0, v0, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v39, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v24, v2 -; GISEL-NEXT: v_sub_i32_e64 v14, s[4:5], 64, v24 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v24, v2 +; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 64, v24 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[12:13], v24 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[6:7], v24 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: v_lshr_b64 v[14:15], v[12:13], v14 -; GISEL-NEXT: v_lshl_b64 v[22:23], v[12:13], v20 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[12:13], v15 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[12:13], v14 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc -; GISEL-NEXT: v_or_b32_e32 v0, v14, v2 -; GISEL-NEXT: v_or_b32_e32 v1, v15, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v20, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v21, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v0, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v15, v1, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v20, v0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, v1, v7, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 ; GISEL-NEXT: v_mov_b32_e32 v2, s10 ; GISEL-NEXT: v_mov_b32_e32 v3, s11 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB2_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader ; GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v36 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v36 ; GISEL-NEXT: v_lshr_b64 v[0:1], v[6:7], v36 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[12:13], v36 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_add_i32_e32 v48, vcc, -1, v35 ; GISEL-NEXT: v_addc_u32_e32 v49, vcc, -1, v34, vcc ; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], v22 ; GISEL-NEXT: v_lshr_b64 v[24:25], v[6:7], v24 ; GISEL-NEXT: v_addc_u32_e32 v50, vcc, -1, v4, vcc ; GISEL-NEXT: v_addc_u32_e32 v51, vcc, -1, v5, vcc -; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 ; GISEL-NEXT: v_or_b32_e32 v3, v3, v23 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v36 @@ -2347,108 +2347,108 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v24, v2, v12, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v25, v3, v13, vcc ; GISEL-NEXT: v_mov_b32_e32 v23, 0 -; GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 ; GISEL-NEXT: .LBB2_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v21 -; GISEL-NEXT: v_lshl_b64 v[52:53], v[24:25], 1 -; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v25 -; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v15 -; GISEL-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 ; GISEL-NEXT: v_add_i32_e32 v36, vcc, -1, v36 ; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc -; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v2, v26, v24 -; GISEL-NEXT: v_or_b32_e32 v3, v52, v25 -; GISEL-NEXT: v_or_b32_e32 v14, v14, v22 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[26:27], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v25 +; GISEL-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v21 +; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v27, 31, v15 +; GISEL-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 ; GISEL-NEXT: v_addc_u32_e32 v38, vcc, -1, v38, vcc ; GISEL-NEXT: v_addc_u32_e32 v39, vcc, -1, v39, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v48, v3 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v49, v53, vcc +; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v24, v24, v26 +; GISEL-NEXT: v_or_b32_e32 v20, v20, v27 +; GISEL-NEXT: v_or_b32_e32 v14, v0, v14 +; GISEL-NEXT: v_or_b32_e32 v15, v1, v15 ; GISEL-NEXT: v_or_b32_e32 v0, v36, v38 ; GISEL-NEXT: v_or_b32_e32 v1, v37, v39 -; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v50, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v51, v27, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22 -; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v48, v24 +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v49, v25, vcc +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v50, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v51, v3, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GISEL-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GISEL-NEXT: v_and_b32_e32 v1, v0, v35 -; GISEL-NEXT: v_and_b32_e32 v25, v0, v34 -; GISEL-NEXT: v_and_b32_e32 v26, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v26, v0, v34 +; GISEL-NEXT: v_and_b32_e32 v27, v0, v4 ; GISEL-NEXT: v_and_b32_e32 v52, v0, v5 -; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 -; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v53, v25, vcc +; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v24, v1 +; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v26, vcc +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v27, vcc +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v3, v52, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v22 ; GISEL-NEXT: v_mov_b32_e32 v1, v23 -; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc -; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB2_9 ; GISEL-NEXT: ; %bb.10: ; %Flow -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB2_11: ; %Flow11 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], 1 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v21 +; GISEL-NEXT: .LBB2_11: ; %Flow11 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_lshl_b64 v[22:23], v[14:15], 1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v15 ; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 ; GISEL-NEXT: v_or_b32_e32 v20, v0, v22 ; GISEL-NEXT: v_or_b32_e32 v21, v1, v23 ; GISEL-NEXT: .LBB2_12: ; %Flow12 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v26, v30, v19 -; GISEL-NEXT: v_mul_lo_u32 v27, v29, v18 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v31, v28, 0 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v31, v18, 0 +; GISEL-NEXT: v_mul_lo_u32 v26, v31, v19 +; GISEL-NEXT: v_mul_lo_u32 v27, v30, v18 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v20, 0 ; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v2, 0 ; GISEL-NEXT: v_mul_lo_u32 v36, v35, v3 ; GISEL-NEXT: v_mul_lo_u32 v37, v34, v2 -; GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v29, v32, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v30, v32, v[14:15] ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[18:19] -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[24:25] +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v28, v[24:25] ; GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v4, v20, v[14:15] -; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v31, v32, v[1:2] ; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[23:24] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v29, v31, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v30, v28, v[14:15] ; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v26, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2] ; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v25, v36, s[6:7] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v27, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 ; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v18, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v28 +; GISEL-NEXT: v_xor_b32_e32 v18, v0, v29 ; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v37, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v22 ; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5] ; GISEL-NEXT: v_xor_b32_e32 v22, v2, v33 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28 +; GISEL-NEXT: v_xor_b32_e32 v1, v16, v29 ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1] ; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33 -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v18, v28 -; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v28, v[2:3] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v18, v29 +; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v29, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13] ; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v22, v33 ; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9] ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v19, vcc ; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v29 ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v33 -; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28 +; GISEL-NEXT: v_xor_b32_e32 v7, v8, v29 ; GISEL-NEXT: v_xor_b32_e32 v8, v2, v33 -; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v3, v28, s[6:7] -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc +; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v3, v29, s[6:7] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v29, vcc ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9] ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2495,22 +2495,22 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v18 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 ; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 -; SDAG-NEXT: v_or_b32_e32 v17, v19, v21 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 +; SDAG-NEXT: v_or_b32_e32 v19, v17, v21 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[20:21] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v23, v22, s[4:5] -; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v22, s[4:5] +; SDAG-NEXT: v_and_b32_e32 v18, 1, v18 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v33, v3, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 @@ -2521,91 +2521,91 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] ; SDAG-NEXT: s_cbranch_execz .LBB3_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 -; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v18 -; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v18 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v19, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v16 +; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v17, vcc ; SDAG-NEXT: v_lshl_b64 v[22:23], v[0:1], v22 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v20, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v21, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0x7f, v18 -; SDAG-NEXT: v_or_b32_e32 v20, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v21 -; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v21 -; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v21 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20] -; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v18 -; SDAG-NEXT: v_or_b32_e32 v19, v25, v19 -; SDAG-NEXT: v_or_b32_e32 v18, v24, v18 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v21 -; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, v19, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v18, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v27, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v26, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v21 -; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5] -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_or_b32_e32 v20, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 +; SDAG-NEXT: v_or_b32_e32 v21, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[2:3], v26 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v26 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v27 +; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v20, v23, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v24, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v21, v20, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v22, v2, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_5 ; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 ; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v30 -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v30 +; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_lshr_b64 v[26:27], v[2:3], v30 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v8 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: v_lshl_b64 v[28:29], v[2:3], v16 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_lshl_b64 v[28:29], v[2:3], v18 ; SDAG-NEXT: v_lshr_b64 v[37:38], v[2:3], v35 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v25, v29 +; SDAG-NEXT: v_or_b32_e32 v18, v25, v29 ; SDAG-NEXT: v_or_b32_e32 v24, v24, v28 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v16, v38, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v38, v18, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v24, v37, v24, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; SDAG-NEXT: v_cndmask_b32_e32 v27, v16, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v27, v18, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v26, v24, v0, vcc ; SDAG-NEXT: v_mov_b32_e32 v24, 0 ; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: .LBB3_3: ; %udiv-do-while3 ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v23 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 ; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v27 +; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27 ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v19 -; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 -; SDAG-NEXT: v_or_b32_e32 v22, v24, v22 -; SDAG-NEXT: v_or_b32_e32 v24, v28, v38 -; SDAG-NEXT: v_or_b32_e32 v25, v26, v39 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v16 -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v34, v25 -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v35, v27, vcc -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v24, vcc -; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v29, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v16 -; SDAG-NEXT: v_and_b32_e32 v26, v16, v8 -; SDAG-NEXT: v_and_b32_e32 v28, v16, v9 -; SDAG-NEXT: v_and_b32_e32 v38, v16, v10 -; SDAG-NEXT: v_and_b32_e32 v39, v16, v11 -; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v25, v26 -; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v28, vcc -; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v24, v38, vcc +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v18, v28, v18 +; SDAG-NEXT: v_or_b32_e32 v26, v26, v38 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v39 +; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 +; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 +; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v34, v26 +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v35, v27, vcc +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v36, v18, vcc +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v37, v29, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v24 +; SDAG-NEXT: v_and_b32_e32 v24, v38, v8 +; SDAG-NEXT: v_and_b32_e32 v25, v38, v9 +; SDAG-NEXT: v_and_b32_e32 v28, v38, v10 +; SDAG-NEXT: v_and_b32_e32 v39, v38, v11 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v24 +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v25, vcc +; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v18, v28, vcc ; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v39, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc @@ -2614,25 +2614,25 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_or_b32_e32 v24, v30, v32 ; SDAG-NEXT: v_or_b32_e32 v25, v31, v33 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25] -; SDAG-NEXT: v_or_b32_e32 v19, v21, v19 +; SDAG-NEXT: v_and_b32_e32 v18, 1, v38 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v18, v20, v18 -; SDAG-NEXT: v_mov_b32_e32 v25, v17 -; SDAG-NEXT: v_mov_b32_e32 v24, v16 +; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 +; SDAG-NEXT: v_mov_b32_e32 v25, v19 +; SDAG-NEXT: v_mov_b32_e32 v24, v18 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB3_3 ; SDAG-NEXT: ; %bb.4: ; %Flow13 ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB3_5: ; %Flow14 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v23 -; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v24 -; SDAG-NEXT: v_or_b32_e32 v33, v21, v19 -; SDAG-NEXT: v_or_b32_e32 v30, v17, v23 -; SDAG-NEXT: v_or_b32_e32 v31, v20, v18 -; SDAG-NEXT: v_or_b32_e32 v32, v16, v22 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v24 +; SDAG-NEXT: v_or_b32_e32 v33, v23, v21 +; SDAG-NEXT: v_or_b32_e32 v30, v19, v17 +; SDAG-NEXT: v_or_b32_e32 v31, v22, v20 +; SDAG-NEXT: v_or_b32_e32 v32, v18, v16 ; SDAG-NEXT: .LBB3_6: ; %Flow16 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: v_or_b32_e32 v17, v13, v15 @@ -2670,58 +2670,58 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v20, v17, vcc -; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v18 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 ; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v17, v19, v21 +; SDAG-NEXT: v_or_b32_e32 v19, v17, v21 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_and_b32_e32 v16, 1, v22 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_and_b32_e32 v18, 1, v22 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 ; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v23, v7, 0, s[4:5] ; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v17, v5, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v4, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] ; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v18 -; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v18 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc +; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v16 +; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v17, vcc ; SDAG-NEXT: v_lshl_b64 v[22:23], v[4:5], v22 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v20, vcc ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v21, vcc -; SDAG-NEXT: v_or_b32_e32 v19, v34, v36 -; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v18 -; SDAG-NEXT: v_or_b32_e32 v20, v35, v37 -; SDAG-NEXT: v_lshl_b64 v[24:25], v[6:7], v28 -; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v28 -; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v28 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20] -; SDAG-NEXT: v_lshr_b64 v[18:19], v[4:5], v18 -; SDAG-NEXT: v_or_b32_e32 v19, v25, v19 -; SDAG-NEXT: v_or_b32_e32 v18, v24, v18 -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28 -; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, v19, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v18, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v27, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v26, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28 -; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v6, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v20, v34, v36 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 +; SDAG-NEXT: v_or_b32_e32 v21, v35, v37 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[6:7], v26 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v26 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v27 +; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v20, v23, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v24, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v21, v20, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v22, v6, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -2729,131 +2729,131 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_cbranch_execz .LBB3_11 ; SDAG-NEXT: ; %bb.8: ; %udiv-preheader ; SDAG-NEXT: v_lshr_b64 v[24:25], v[4:5], v34 -; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v34 +; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v34 ; SDAG-NEXT: v_subrev_i32_e32 v39, vcc, 64, v34 ; SDAG-NEXT: v_lshr_b64 v[26:27], v[6:7], v34 ; SDAG-NEXT: v_add_i32_e32 v38, vcc, -1, v12 ; SDAG-NEXT: v_mov_b32_e32 v22, 0 ; SDAG-NEXT: v_mov_b32_e32 v23, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v17, 0 -; SDAG-NEXT: v_lshl_b64 v[28:29], v[6:7], v16 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_lshl_b64 v[28:29], v[6:7], v18 ; SDAG-NEXT: v_lshr_b64 v[49:50], v[6:7], v39 ; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v16, v25, v29 +; SDAG-NEXT: v_or_b32_e32 v18, v25, v29 ; SDAG-NEXT: v_or_b32_e32 v24, v24, v28 ; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v16, v50, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v50, v18, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v24, v49, v24, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 -; SDAG-NEXT: v_cndmask_b32_e32 v27, v16, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v27, v18, v5, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v26, v24, v4, vcc ; SDAG-NEXT: v_mov_b32_e32 v24, 0 ; SDAG-NEXT: v_mov_b32_e32 v25, 0 ; SDAG-NEXT: .LBB3_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v27 +; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27 ; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v19 -; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v21 +; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v16, v28, v16 +; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v18, v28, v18 ; SDAG-NEXT: v_or_b32_e32 v26, v26, v50 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v51 -; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 -; SDAG-NEXT: v_or_b32_e32 v21, v25, v21 -; SDAG-NEXT: v_sub_i32_e32 v25, vcc, v38, v26 -; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v39, v27, vcc -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v48, v16, vcc -; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v49, v29, vcc -; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v25 -; SDAG-NEXT: v_and_b32_e32 v28, v25, v12 -; SDAG-NEXT: v_and_b32_e32 v50, v25, v13 -; SDAG-NEXT: v_and_b32_e32 v51, v25, v14 -; SDAG-NEXT: v_and_b32_e32 v52, v25, v15 -; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v28 -; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v50, vcc -; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v16, v51, vcc -; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v52, vcc +; SDAG-NEXT: v_or_b32_e32 v20, v20, v51 +; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 +; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 +; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v38, v26 +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v39, v27, vcc +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v48, v18, vcc +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v49, v29, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v50, 31, v24 +; SDAG-NEXT: v_and_b32_e32 v24, v50, v12 +; SDAG-NEXT: v_and_b32_e32 v25, v50, v13 +; SDAG-NEXT: v_and_b32_e32 v28, v50, v14 +; SDAG-NEXT: v_and_b32_e32 v51, v50, v15 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v24 +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v25, vcc +; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v18, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v51, vcc ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v34 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc -; SDAG-NEXT: v_or_b32_e32 v50, v34, v36 -; SDAG-NEXT: v_or_b32_e32 v51, v35, v37 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[50:51] -; SDAG-NEXT: v_and_b32_e32 v16, 1, v25 +; SDAG-NEXT: v_or_b32_e32 v24, v34, v36 +; SDAG-NEXT: v_or_b32_e32 v25, v35, v37 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25] +; SDAG-NEXT: v_and_b32_e32 v18, 1, v50 ; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; SDAG-NEXT: v_or_b32_e32 v20, v24, v20 -; SDAG-NEXT: v_mov_b32_e32 v25, v17 -; SDAG-NEXT: v_mov_b32_e32 v24, v16 +; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 +; SDAG-NEXT: v_mov_b32_e32 v25, v19 +; SDAG-NEXT: v_mov_b32_e32 v24, v18 ; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] ; SDAG-NEXT: s_cbranch_execnz .LBB3_9 ; SDAG-NEXT: ; %bb.10: ; %Flow ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB3_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v21 ; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; SDAG-NEXT: v_or_b32_e32 v18, v18, v24 -; SDAG-NEXT: v_or_b32_e32 v23, v23, v19 -; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 -; SDAG-NEXT: v_or_b32_e32 v22, v22, v18 -; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v24 +; SDAG-NEXT: v_or_b32_e32 v23, v23, v21 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v17 +; SDAG-NEXT: v_or_b32_e32 v22, v22, v20 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v16 ; SDAG-NEXT: .LBB3_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] -; SDAG-NEXT: v_mul_lo_u32 v21, v32, v11 -; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v32, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v26, v30, v10 +; SDAG-NEXT: v_mul_lo_u32 v20, v32, v11 +; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v32, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v21, v30, v10 ; SDAG-NEXT: v_mul_lo_u32 v27, v33, v8 ; SDAG-NEXT: v_mul_lo_u32 v28, v31, v9 -; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v32, 0 -; SDAG-NEXT: v_mov_b32_e32 v20, 0 -; SDAG-NEXT: v_mul_lo_u32 v29, v16, v15 -; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 -; SDAG-NEXT: v_mul_lo_u32 v33, v17, v14 -; SDAG-NEXT: v_mul_lo_u32 v34, v23, v12 -; SDAG-NEXT: v_mul_lo_u32 v35, v22, v13 -; SDAG-NEXT: v_add_i32_e32 v21, vcc, v25, v21 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v9, v32, v[19:20] -; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v18 +; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v32, 0 +; SDAG-NEXT: v_mov_b32_e32 v26, 0 +; SDAG-NEXT: v_mul_lo_u32 v29, v18, v15 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v14, 0 +; SDAG-NEXT: v_mul_lo_u32 v33, v19, v14 +; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12 +; SDAG-NEXT: v_mul_lo_u32 v34, v22, v13 +; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v9, v32, v[25:26] +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v24 ; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v29 -; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v21, v26 -; SDAG-NEXT: v_mov_b32_e32 v19, v14 -; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v30, v[19:20] +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v17, v21 +; SDAG-NEXT: v_mov_b32_e32 v25, v14 +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v8, v30, v[25:26] ; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v33 -; SDAG-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v31, v8, v[24:25] -; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v15, v19 +; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v31, v8, v[16:17] +; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v15, v21 ; SDAG-NEXT: v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v18, vcc +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v20, vcc ; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[10:11] -; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v16, 0 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v27, v24 +; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v12, v18, 0 +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v27, v17 ; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[14:15] -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v34, v11 -; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v13, v16, v[19:20] -; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v28, v21 -; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v35, v11 -; SDAG-NEXT: v_mov_b32_e32 v19, v14 -; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[19:20] -; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23 -; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v16, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v23, v11 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v13, v18, v[25:26] +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v28, v17 +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v34, v11 +; SDAG-NEXT: v_mov_b32_e32 v25, v14 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v19, v[25:26] +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v16 +; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v17, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc ; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v15, v12 ; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc -; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[8:9] +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v19, v[8:9] ; SDAG-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v21, vcc -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v18 +; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v18, vcc +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v24 ; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc ; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc @@ -2917,11 +2917,11 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 ; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc ; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB3_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, 1, v18 @@ -2952,89 +2952,89 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v16, s8 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB3_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 ; GISEL-NEXT: v_add_i32_e32 v26, vcc, 0xffffffc0, v30 ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v30 ; GISEL-NEXT: v_lshr_b64 v[16:17], v[2:3], v30 ; GISEL-NEXT: v_lshr_b64 v[18:19], v[0:1], v30 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_add_i32_e32 v34, vcc, -1, v8 ; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc ; GISEL-NEXT: v_lshl_b64 v[24:25], v[2:3], v24 ; GISEL-NEXT: v_lshr_b64 v[26:27], v[2:3], v26 ; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc ; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc -; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_or_b32_e32 v18, v18, v24 ; GISEL-NEXT: v_or_b32_e32 v19, v19, v25 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 ; GISEL-NEXT: v_cndmask_b32_e32 v18, v26, v18, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v28, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v29, 0, v17, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; GISEL-NEXT: v_cndmask_b32_e32 v28, v18, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v29, v19, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v25, 0 -; GISEL-NEXT: v_mov_b32_e32 v19, s7 -; GISEL-NEXT: v_mov_b32_e32 v18, s6 -; GISEL-NEXT: v_mov_b32_e32 v17, s5 -; GISEL-NEXT: v_mov_b32_e32 v16, s4 +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: v_mov_b32_e32 v18, s10 +; GISEL-NEXT: v_mov_b32_e32 v17, s9 +; GISEL-NEXT: v_mov_b32_e32 v16, s8 ; GISEL-NEXT: .LBB3_3: ; %udiv-do-while3 ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshrrev_b32_e32 v38, 31, v23 -; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v29 -; GISEL-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 -; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v39, 31, v21 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; GISEL-NEXT: v_lshl_b64 v[18:19], v[28:29], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v27 +; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v28, 31, v21 ; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; GISEL-NEXT: v_or_b32_e32 v22, v16, v18 -; GISEL-NEXT: v_or_b32_e32 v23, v17, v19 -; GISEL-NEXT: v_or_b32_e32 v18, v26, v24 -; GISEL-NEXT: v_or_b32_e32 v19, v28, v39 +; GISEL-NEXT: v_lshrrev_b32_e32 v29, 31, v23 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v34, v19 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v29, vcc +; GISEL-NEXT: v_or_b32_e32 v18, v18, v24 +; GISEL-NEXT: v_or_b32_e32 v26, v26, v28 +; GISEL-NEXT: v_or_b32_e32 v20, v20, v29 +; GISEL-NEXT: v_or_b32_e32 v22, v16, v22 +; GISEL-NEXT: v_or_b32_e32 v23, v17, v23 ; GISEL-NEXT: v_or_b32_e32 v16, v30, v32 ; GISEL-NEXT: v_or_b32_e32 v17, v31, v33 -; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v36, v18, vcc -; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v37, v27, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v24 -; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v34, v26 +; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v35, v27, vcc +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[16:17] +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v36, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v37, v19, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v16 +; GISEL-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GISEL-NEXT: v_and_b32_e32 v17, v16, v8 -; GISEL-NEXT: v_and_b32_e32 v26, v16, v9 -; GISEL-NEXT: v_and_b32_e32 v39, v16, v10 -; GISEL-NEXT: v_and_b32_e32 v48, v16, v11 +; GISEL-NEXT: v_and_b32_e32 v28, v16, v9 +; GISEL-NEXT: v_and_b32_e32 v29, v16, v10 +; GISEL-NEXT: v_and_b32_e32 v38, v16, v11 ; GISEL-NEXT: v_and_b32_e32 v24, 1, v16 -; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17 -; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v26, vcc -; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v18, v39, vcc -; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v48, vcc -; GISEL-NEXT: v_or_b32_e32 v20, v20, v38 +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v26, v17 +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v28, vcc +; GISEL-NEXT: v_subb_u32_e32 v28, vcc, v18, v29, vcc +; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v19, v38, vcc ; GISEL-NEXT: v_mov_b32_e32 v16, v24 ; GISEL-NEXT: v_mov_b32_e32 v17, v25 -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB3_3 ; GISEL-NEXT: ; %bb.4: ; %Flow13 -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB3_5: ; %Flow14 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 +; GISEL-NEXT: .LBB3_5: ; %Flow14 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23 -; GISEL-NEXT: v_or_b32_e32 v20, v20, v22 +; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v23 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 +; GISEL-NEXT: v_or_b32_e32 v20, v20, v24 ; GISEL-NEXT: v_or_b32_e32 v32, v16, v18 ; GISEL-NEXT: v_or_b32_e32 v33, v17, v19 ; GISEL-NEXT: .LBB3_6: ; %Flow16 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_or_b32_e32 v16, v12, v14 ; GISEL-NEXT: v_or_b32_e32 v17, v13, v15 @@ -3094,7 +3094,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e64 v19, v7, 0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB3_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 ; GISEL-NEXT: v_add_i32_e32 v34, vcc, 1, v22 @@ -3103,44 +3103,44 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_not_b32_e32 v18, 63 ; GISEL-NEXT: v_addc_u32_e64 v36, vcc, 0, v16, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v17, vcc -; GISEL-NEXT: v_add_i32_e64 v24, s[4:5], v28, v18 -; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], 64, v28 +; GISEL-NEXT: v_add_i32_e64 v22, s[4:5], v28, v18 +; GISEL-NEXT: v_sub_i32_e64 v23, s[4:5], 64, v28 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[4:5], v28 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[6:7], v28 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: v_lshr_b64 v[22:23], v[4:5], v22 -; GISEL-NEXT: v_lshl_b64 v[26:27], v[4:5], v24 +; GISEL-NEXT: v_lshr_b64 v[24:25], v[4:5], v23 +; GISEL-NEXT: v_lshl_b64 v[26:27], v[4:5], v22 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28 -; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v17, vcc -; GISEL-NEXT: v_or_b32_e32 v16, v22, v18 -; GISEL-NEXT: v_or_b32_e32 v17, v23, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v17, vcc +; GISEL-NEXT: v_or_b32_e32 v16, v24, v18 +; GISEL-NEXT: v_or_b32_e32 v17, v25, v19 ; GISEL-NEXT: v_cndmask_b32_e32 v16, v26, v16, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v17, v27, v17, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28 -; GISEL-NEXT: v_cndmask_b32_e32 v22, v16, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v23, v17, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v24, v16, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, v17, v7, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v19, s11 ; GISEL-NEXT: v_mov_b32_e32 v18, s10 ; GISEL-NEXT: v_mov_b32_e32 v17, s9 ; GISEL-NEXT: v_mov_b32_e32 v16, s8 -; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9] ; GISEL-NEXT: s_cbranch_execz .LBB3_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader ; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v34 ; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 64, v34 ; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v34 ; GISEL-NEXT: v_lshr_b64 v[18:19], v[4:5], v34 -; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_add_i32_e32 v38, vcc, -1, v12 ; GISEL-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc ; GISEL-NEXT: v_lshl_b64 v[26:27], v[6:7], v26 ; GISEL-NEXT: v_lshr_b64 v[28:29], v[6:7], v28 ; GISEL-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc ; GISEL-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc -; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_or_b32_e32 v18, v18, v26 ; GISEL-NEXT: v_or_b32_e32 v19, v19, v27 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v34 @@ -3152,62 +3152,62 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v28, v18, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v29, v19, v5, vcc ; GISEL-NEXT: v_mov_b32_e32 v27, 0 -; GISEL-NEXT: v_mov_b32_e32 v19, s7 -; GISEL-NEXT: v_mov_b32_e32 v18, s6 -; GISEL-NEXT: v_mov_b32_e32 v17, s5 -; GISEL-NEXT: v_mov_b32_e32 v16, s4 +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: v_mov_b32_e32 v18, s10 +; GISEL-NEXT: v_mov_b32_e32 v17, s9 +; GISEL-NEXT: v_mov_b32_e32 v16, s8 ; GISEL-NEXT: .LBB3_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshl_b64 v[18:19], v[24:25], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v25 -; GISEL-NEXT: v_lshl_b64 v[50:51], v[28:29], 1 -; GISEL-NEXT: v_lshl_b64 v[30:31], v[30:31], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v28, 31, v29 -; GISEL-NEXT: v_lshrrev_b32_e32 v29, 31, v23 -; GISEL-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 ; GISEL-NEXT: v_add_i32_e32 v34, vcc, -1, v34 ; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc -; GISEL-NEXT: v_or_b32_e32 v24, v16, v18 -; GISEL-NEXT: v_or_b32_e32 v25, v17, v19 -; GISEL-NEXT: v_or_b32_e32 v18, v30, v28 -; GISEL-NEXT: v_or_b32_e32 v19, v50, v29 -; GISEL-NEXT: v_or_b32_e32 v22, v22, v26 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[30:31], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v29 +; GISEL-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v30, 31, v25 +; GISEL-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v31, 31, v23 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 ; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc ; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v38, v19 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v39, v51, vcc +; GISEL-NEXT: v_or_b32_e32 v18, v18, v26 +; GISEL-NEXT: v_or_b32_e32 v28, v28, v30 +; GISEL-NEXT: v_or_b32_e32 v24, v24, v31 +; GISEL-NEXT: v_or_b32_e32 v22, v16, v22 +; GISEL-NEXT: v_or_b32_e32 v23, v17, v23 ; GISEL-NEXT: v_or_b32_e32 v16, v34, v36 ; GISEL-NEXT: v_or_b32_e32 v17, v35, v37 -; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v48, v18, vcc -; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v49, v31, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v26 -; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v26, 1, v16 +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v38, v28 +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v39, v29, vcc +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[16:17] +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v48, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v49, v19, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v16 +; GISEL-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GISEL-NEXT: v_and_b32_e32 v17, v16, v12 -; GISEL-NEXT: v_and_b32_e32 v29, v16, v13 -; GISEL-NEXT: v_and_b32_e32 v30, v16, v14 +; GISEL-NEXT: v_and_b32_e32 v30, v16, v13 +; GISEL-NEXT: v_and_b32_e32 v31, v16, v14 ; GISEL-NEXT: v_and_b32_e32 v50, v16, v15 -; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17 -; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v51, v29, vcc +; GISEL-NEXT: v_and_b32_e32 v26, 1, v16 +; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v28, v17 +; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v30, vcc +; GISEL-NEXT: v_subb_u32_e32 v30, vcc, v18, v31, vcc +; GISEL-NEXT: v_subb_u32_e32 v31, vcc, v19, v50, vcc ; GISEL-NEXT: v_mov_b32_e32 v16, v26 ; GISEL-NEXT: v_mov_b32_e32 v17, v27 -; GISEL-NEXT: v_subb_u32_e32 v30, vcc, v18, v30, vcc -; GISEL-NEXT: v_subb_u32_e32 v31, vcc, v31, v50, vcc -; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB3_9 ; GISEL-NEXT: ; %bb.10: ; %Flow -; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: .LBB3_11: ; %Flow11 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_lshl_b64 v[26:27], v[24:25], 1 -; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v25 +; GISEL-NEXT: .LBB3_11: ; %Flow11 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_lshl_b64 v[26:27], v[22:23], 1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[24:25], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23 ; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 ; GISEL-NEXT: v_or_b32_e32 v24, v16, v26 ; GISEL-NEXT: v_or_b32_e32 v25, v17, v27 ; GISEL-NEXT: .LBB3_12: ; %Flow12 -; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0 ; GISEL-NEXT: v_mul_lo_u32 v34, v8, v21 diff --git a/llvm/test/CodeGen/AMDGPU/ds_permute_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_permute_a_v.ll index 5cd798d4f6db1..3013ca62d1b65 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_permute_a_v.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_permute_a_v.ll @@ -181,12 +181,81 @@ define i32 @ds_bpermute_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:31] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v32, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v33, a1 +; CHECK-NEXT: ds_bpermute_b32 v32, v32, v33 +; CHECK-NEXT: v_accvgpr_write_b32 a31, v50 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a30, v51 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a29, v52 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a28, v53 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a27, v54 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a26, v55 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a25, v56 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a24, v57 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a23, v58 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a22, v59 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a21, v60 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a20, v61 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a19, v62 ; Reload Reuse +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_accvgpr_write_b32 a0, v32 +; CHECK-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: v_accvgpr_read_b32 v50, a31 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v51, a30 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v52, a29 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v53, a28 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v54, a27 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v55, a26 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v56, a25 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v57, a24 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v58, a23 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v59, a22 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v60, a21 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v61, a20 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v62, a19 ; Reload Reuse +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use v[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1 ; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse @@ -203,7 +272,6 @@ define i32 @ds_bpermute_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse -; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 10 %gep.1 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 24 @@ -295,12 +363,81 @@ define i32 @ds_permute_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def v[0:31] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v32, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v33, a1 +; CHECK-NEXT: ds_permute_b32 v32, v32, v33 +; CHECK-NEXT: v_accvgpr_write_b32 a31, v50 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a30, v51 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a29, v52 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a28, v53 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a27, v54 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a26, v55 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a25, v56 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a24, v57 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a23, v58 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a22, v59 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a21, v60 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a20, v61 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_write_b32 a19, v62 ; Reload Reuse +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_accvgpr_write_b32 a0, v32 +; CHECK-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: v_accvgpr_read_b32 v50, a31 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v51, a30 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v52, a29 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v53, a28 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v54, a27 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v55, a26 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v56, a25 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v57, a24 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v58, a23 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v59, a22 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v60, a21 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v61, a20 ; Reload Reuse +; CHECK-NEXT: v_accvgpr_read_b32 v62, a19 ; Reload Reuse +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use v[0:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: ds_permute_b32 v0, v0, v1 ; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse @@ -317,7 +454,6 @@ define i32 @ds_permute_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse ; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse -; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 10 %gep.1 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 24 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll index 683887b0a55f3..7365e53546127 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll @@ -452,76 +452,76 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v[0:31] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_nop 0 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24 -; GCN-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: v_accvgpr_write_b32 a19, v63 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v32, a0 +; GCN-NEXT: v_accvgpr_read_b32 v33, a1 +; GCN-NEXT: v_accvgpr_read_b32 v34, a2 +; GCN-NEXT: ds_write2_b32 v32, v33, v34 offset0:10 offset1:24 +; GCN-NEXT: v_accvgpr_write_b32 a31, v51 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a30, v52 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a29, v53 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a28, v54 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a27, v55 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a26, v56 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a25, v57 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a24, v58 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a23, v59 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a22, v60 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a21, v61 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a20, v62 ; Reload Reuse +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: v_accvgpr_read_b32 v51, a31 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v52, a30 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v53, a29 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v54, a28 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v55, a27 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v56, a26 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v57, a25 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v58, a24 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v59, a23 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v60, a22 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v61, a21 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v62, a20 ; Reload Reuse ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v63, a19 ; Reload Reuse ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use v[0:31] ; GCN-NEXT: ;;#ASMEND @@ -1001,78 +1001,78 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v[0:31] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_nop 0 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_accvgpr_write_b32 a21, v31 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 -; GCN-NEXT: v_accvgpr_write_b32 a31, v21 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a30, v22 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a29, v23 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a28, v24 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a27, v25 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a26, v26 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a25, v27 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a24, v28 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a23, v29 ; Reload Reuse -; GCN-NEXT: v_accvgpr_write_b32 a22, v30 ; Reload Reuse -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: v_accvgpr_read_b32 v21, a31 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v22, a30 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v23, a29 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v24, a28 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v25, a27 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v26, a26 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v27, a25 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v28, a24 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v29, a23 ; Reload Reuse -; GCN-NEXT: v_accvgpr_read_b32 v30, a22 ; Reload Reuse +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_accvgpr_write_b32 a21, v63 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v35, a3 +; GCN-NEXT: v_accvgpr_read_b32 v37, a5 +; GCN-NEXT: v_accvgpr_read_b32 v32, a0 +; GCN-NEXT: v_accvgpr_read_b32 v34, a2 +; GCN-NEXT: v_accvgpr_read_b32 v36, a4 +; GCN-NEXT: ds_write2_b64 v32, v[34:35], v[36:37] offset0:10 offset1:24 +; GCN-NEXT: v_accvgpr_write_b32 a31, v53 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a30, v54 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a29, v55 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a28, v56 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a27, v57 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a26, v58 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a25, v59 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a24, v60 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a23, v61 ; Reload Reuse +; GCN-NEXT: v_accvgpr_write_b32 a22, v62 ; Reload Reuse +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: v_accvgpr_read_b32 v53, a31 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v54, a30 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v55, a29 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v56, a28 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v57, a27 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v58, a26 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v59, a25 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v60, a24 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v61, a23 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v62, a22 ; Reload Reuse ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_accvgpr_read_b32 v31, a21 ; Reload Reuse +; GCN-NEXT: v_accvgpr_read_b32 v63, a21 ; Reload Reuse ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use v[0:31] ; GCN-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll index 1c687734731b1..986adb400a093 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll @@ -296,22 +296,22 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: .LBB3_4: ; %exit ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] -; GCN-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[8:9] -; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] -; GCN-NEXT: v_cmp_gt_i64_e64 s[8:9], 0, v[12:13] -; GCN-NEXT: v_cmp_gt_i64_e64 s[10:11], 0, v[14:15] -; GCN-NEXT: v_cmp_gt_i64_e64 s[12:13], 0, v[16:17] -; GCN-NEXT: v_cmp_gt_i64_e64 s[14:15], 0, v[18:19] -; GCN-NEXT: v_cmp_gt_i64_e64 s[16:17], 0, v[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, s[16:17] ; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v8, v1, -1, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v10, v1, -1, s[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v12, v1, -1, s[12:13] -; GCN-NEXT: v_cndmask_b32_e64 v14, v1, -1, s[14:15] +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] +; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, vcc +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[12:13] +; GCN-NEXT: v_cndmask_b32_e64 v8, v1, -1, vcc +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] +; GCN-NEXT: v_cndmask_b32_e64 v10, v1, -1, vcc +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[16:17] +; GCN-NEXT: v_cndmask_b32_e64 v12, v1, -1, vcc +; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[18:19] +; GCN-NEXT: v_cndmask_b32_e64 v14, v1, -1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, -1 ; GCN-NEXT: v_mov_b32_e32 v3, -1 ; GCN-NEXT: v_mov_b32_e32 v5, -1 @@ -540,22 +540,22 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-NEXT: .LBB6_4: ; %exit ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000 +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v1, -2.0, v0, vcc ; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[4:5], -1.0, v[8:9] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19] -; GCN-NEXT: v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v1, -2.0, v0, s[16:17] ; GCN-NEXT: v_cndmask_b32_e32 v3, -2.0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v5, -2.0, v0, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v7, -2.0, v0, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v9, -2.0, v0, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v11, -2.0, v0, s[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v13, -2.0, v0, s[12:13] -; GCN-NEXT: v_cndmask_b32_e64 v15, -2.0, v0, s[14:15] +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[8:9] +; GCN-NEXT: v_cndmask_b32_e32 v5, -2.0, v0, vcc +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[10:11] +; GCN-NEXT: v_cndmask_b32_e32 v7, -2.0, v0, vcc +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[12:13] +; GCN-NEXT: v_cndmask_b32_e32 v9, -2.0, v0, vcc +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[14:15] +; GCN-NEXT: v_cndmask_b32_e32 v11, -2.0, v0, vcc +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[16:17] +; GCN-NEXT: v_cndmask_b32_e32 v13, -2.0, v0, vcc +; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[18:19] +; GCN-NEXT: v_cndmask_b32_e32 v15, -2.0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 614200803d6f1..32153084f9c3c 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -730,14 +730,18 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; SI-NEXT: v_mov_b32_e32 v9, 0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[4:7], 0 addr64 +; SI-NEXT: v_lshlrev_b32_e32 v9, 1, v0 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_cmp_eq_u32 s8, 1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -753,64 +757,61 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 1, v0 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_cmp_eq_u32 s8, 1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v18 -; SI-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 3 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 4 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 5 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 6 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 7 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 8 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 9 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 10 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 11 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 12 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 13 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 14 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s8, 15 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, v[8:9], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_short v0, v[9:10], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll index bd1f98a39c252..1634aad79a8f5 100644 --- a/llvm/test/CodeGen/AMDGPU/fceil64.ll +++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll @@ -402,350 +402,348 @@ define amdgpu_kernel void @fceil_v8f64(ptr addrspace(1) %out, <8 x double> %x) { define amdgpu_kernel void @fceil_v16f64(ptr addrspace(1) %out, <16 x double> %x) { ; SI-LABEL: fceil_v16f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x29 -; SI-NEXT: s_mov_b32 s26, -1 -; SI-NEXT: s_mov_b32 s29, 0xfffff -; SI-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x9 -; SI-NEXT: v_mov_b32_e32 v8, 0 -; SI-NEXT: s_mov_b32 s28, s26 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 -; SI-NEXT: s_and_b32 s2, s11, 0x80000000 -; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s3 -; SI-NEXT: s_andn2_b64 s[0:1], s[10:11], s[0:1] -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s2, s1 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s3, s11, s1 -; SI-NEXT: s_cselect_b32 s2, s10, s0 -; SI-NEXT: v_cmp_gt_f64_e64 s[0:1], s[10:11], 0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[10:11], v[0:1] -; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; SI-NEXT: s_cselect_b32 s10, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 -; SI-NEXT: s_and_b32 s6, s9, 0x80000000 -; SI-NEXT: s_add_i32 s7, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s7 -; SI-NEXT: s_andn2_b64 s[0:1], s[8:9], s[0:1] -; SI-NEXT: s_cmp_lt_i32 s7, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s6, s1 -; SI-NEXT: s_cmp_gt_i32 s7, 51 -; SI-NEXT: s_cselect_b32 s7, s9, s1 -; SI-NEXT: s_cselect_b32 s6, s8, s0 -; SI-NEXT: v_cmp_gt_f64_e64 s[0:1], s[8:9], 0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[8:9], v[0:1] -; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; SI-NEXT: s_cselect_b32 s27, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s0, s15, 0xb0014 -; SI-NEXT: s_and_b32 s8, s15, 0x80000000 -; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s9 -; SI-NEXT: s_andn2_b64 s[0:1], s[14:15], s[0:1] -; SI-NEXT: s_cmp_lt_i32 s9, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s8, s1 -; SI-NEXT: s_cmp_gt_i32 s9, 51 -; SI-NEXT: s_cselect_b32 s9, s15, s1 -; SI-NEXT: s_cselect_b32 s8, s14, s0 -; SI-NEXT: v_cmp_gt_f64_e64 s[0:1], s[14:15], 0 -; SI-NEXT: v_mov_b32_e32 v9, s10 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[14:15], v[0:1] -; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; SI-NEXT: s_cselect_b32 s14, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s0, s13, 0xb0014 -; SI-NEXT: s_and_b32 s10, s13, 0x80000000 -; SI-NEXT: s_add_i32 s15, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s15 -; SI-NEXT: s_andn2_b64 s[0:1], s[12:13], s[0:1] -; SI-NEXT: s_cmp_lt_i32 s15, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s10, s1 -; SI-NEXT: v_cmp_gt_f64_e64 s[10:11], s[12:13], 0 -; SI-NEXT: s_cmp_gt_i32 s15, 51 -; SI-NEXT: s_cselect_b32 s1, s13, s1 -; SI-NEXT: s_cselect_b32 s0, s12, s0 -; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[12:13], v[0:1] -; SI-NEXT: s_and_b64 s[2:3], s[10:11], vcc -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cselect_b32 s10, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s2, s19, 0xb0014 -; SI-NEXT: s_and_b32 s11, s19, 0x80000000 -; SI-NEXT: s_add_i32 s12, s2, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[28:29], s12 -; SI-NEXT: s_andn2_b64 s[2:3], s[18:19], s[2:3] -; SI-NEXT: s_cmp_lt_i32 s12, 0 -; SI-NEXT: s_cselect_b32 s13, 0, s2 -; SI-NEXT: s_cselect_b32 s11, s11, s3 -; SI-NEXT: v_cmp_gt_f64_e64 s[2:3], s[18:19], 0 -; SI-NEXT: s_cmp_gt_i32 s12, 51 -; SI-NEXT: s_cselect_b32 s31, s19, s11 -; SI-NEXT: s_cselect_b32 s30, s18, s13 -; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s14 -; SI-NEXT: v_mov_b32_e32 v4, s30 -; SI-NEXT: v_mov_b32_e32 v5, s31 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[18:19], v[4:5] -; SI-NEXT: s_and_b64 s[2:3], s[2:3], vcc -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cselect_b32 s33, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s2, s17, 0xb0014 -; SI-NEXT: s_and_b32 s6, s17, 0x80000000 -; SI-NEXT: s_add_i32 s7, s2, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[28:29], s7 -; SI-NEXT: s_andn2_b64 s[2:3], s[16:17], s[2:3] -; SI-NEXT: s_cmp_lt_i32 s7, 0 -; SI-NEXT: s_cselect_b32 s11, 0, s2 -; SI-NEXT: s_cselect_b32 s6, s6, s3 -; SI-NEXT: v_cmp_gt_f64_e64 s[2:3], s[16:17], 0 -; SI-NEXT: s_cmp_gt_i32 s7, 51 -; SI-NEXT: s_cselect_b32 s19, s17, s6 -; SI-NEXT: s_cselect_b32 s18, s16, s11 -; SI-NEXT: v_add_f64 v[6:7], s[8:9], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s10 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s19 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[16:17], v[4:5] -; SI-NEXT: s_and_b64 s[2:3], s[2:3], vcc -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cselect_b32 s36, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s2, s23, 0xb0014 -; SI-NEXT: s_and_b32 s6, s23, 0x80000000 -; SI-NEXT: s_add_i32 s7, s2, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[28:29], s7 -; SI-NEXT: s_andn2_b64 s[2:3], s[22:23], s[2:3] -; SI-NEXT: s_cmp_lt_i32 s7, 0 -; SI-NEXT: s_cselect_b32 s6, s6, s3 -; SI-NEXT: s_cselect_b32 s8, 0, s2 -; SI-NEXT: v_cmp_gt_f64_e64 s[2:3], s[22:23], 0 -; SI-NEXT: s_cmp_gt_i32 s7, 51 -; SI-NEXT: s_cselect_b32 s35, s23, s6 -; SI-NEXT: s_cselect_b32 s34, s22, s8 -; SI-NEXT: v_add_f64 v[4:5], s[0:1], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s34 -; SI-NEXT: v_mov_b32_e32 v10, s35 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[22:23], v[9:10] -; SI-NEXT: s_and_b64 s[0:1], s[2:3], vcc -; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; SI-NEXT: s_cselect_b32 s37, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s0, s21, 0xb0014 -; SI-NEXT: s_and_b32 s2, s21, 0x80000000 -; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s3 -; SI-NEXT: s_andn2_b64 s[0:1], s[20:21], s[0:1] -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: s_cselect_b32 s1, s2, s1 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_cselect_b32 s17, s21, s1 -; SI-NEXT: s_cselect_b32 s16, s20, s0 -; SI-NEXT: v_cmp_gt_f64_e64 s[22:23], s[20:21], 0 -; SI-NEXT: v_mov_b32_e32 v9, s16 -; SI-NEXT: v_mov_b32_e32 v10, s17 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[20:21], v[9:10] +; SI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x29 ; SI-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x39 -; SI-NEXT: s_mov_b32 s27, 0xf000 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:16 -; SI-NEXT: v_mov_b32_e32 v9, s33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[6:7], s[30:31], v[8:9] +; SI-NEXT: s_mov_b32 s39, 0xf000 +; SI-NEXT: s_mov_b32 s38, -1 +; SI-NEXT: s_mov_b32 s35, 0xfffff +; SI-NEXT: v_mov_b32_e32 v12, 0 +; SI-NEXT: s_mov_b32 s34, s38 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_gt_f64_e64 s[20:21], s[2:3], 0 -; SI-NEXT: v_mov_b32_e32 v9, s36 -; SI-NEXT: v_cmp_gt_f64_e64 s[30:31], s[0:1], 0 -; SI-NEXT: v_add_f64 v[4:5], s[18:19], v[8:9] -; SI-NEXT: v_cmp_gt_f64_e64 s[18:19], s[6:7], 0 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:32 -; SI-NEXT: v_mov_b32_e32 v9, s37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[6:7], s[34:35], v[8:9] -; SI-NEXT: v_cmp_gt_f64_e64 s[34:35], s[4:5], 0 -; SI-NEXT: s_and_b64 s[22:23], s[22:23], vcc +; SI-NEXT: s_bfe_u32 s33, s19, 0xb0014 +; SI-NEXT: s_and_b32 s44, s19, 0x80000000 +; SI-NEXT: v_cmp_gt_f64_e64 s[40:41], s[18:19], 0 +; SI-NEXT: s_addk_i32 s33, 0xfc01 +; SI-NEXT: s_lshr_b64 s[42:43], s[34:35], s33 +; SI-NEXT: s_andn2_b64 s[42:43], s[18:19], s[42:43] +; SI-NEXT: s_cmp_lt_i32 s33, 0 +; SI-NEXT: s_cselect_b32 s42, 0, s42 +; SI-NEXT: s_cselect_b32 s43, s44, s43 +; SI-NEXT: s_cmp_gt_i32 s33, 51 +; SI-NEXT: s_cselect_b32 s43, s19, s43 +; SI-NEXT: s_cselect_b32 s42, s18, s42 +; SI-NEXT: v_cmp_gt_f64_e64 s[44:45], s[16:17], 0 +; SI-NEXT: v_mov_b32_e32 v0, s42 +; SI-NEXT: v_mov_b32_e32 v1, s43 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[18:19], v[0:1] +; SI-NEXT: s_and_b64 s[18:19], s[40:41], vcc +; SI-NEXT: v_cmp_gt_f64_e64 s[40:41], s[22:23], 0 +; SI-NEXT: s_and_b64 s[18:19], s[18:19], exec +; SI-NEXT: s_cselect_b32 s18, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s19, s17, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: s_and_b32 s33, s17, 0x80000000 +; SI-NEXT: v_add_f64 v[2:3], s[42:43], v[12:13] +; SI-NEXT: s_add_i32 s42, s19, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[18:19], s[34:35], s42 +; SI-NEXT: s_andn2_b64 s[18:19], s[16:17], s[18:19] +; SI-NEXT: s_cmp_lt_i32 s42, 0 +; SI-NEXT: s_cselect_b32 s18, 0, s18 +; SI-NEXT: s_cselect_b32 s19, s33, s19 +; SI-NEXT: s_cmp_gt_i32 s42, 51 +; SI-NEXT: s_cselect_b32 s43, s17, s19 +; SI-NEXT: s_cselect_b32 s42, s16, s18 +; SI-NEXT: v_cmp_gt_f64_e64 s[18:19], s[20:21], 0 +; SI-NEXT: v_mov_b32_e32 v0, s42 +; SI-NEXT: v_mov_b32_e32 v1, s43 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[16:17], v[0:1] +; SI-NEXT: s_and_b64 s[16:17], s[44:45], vcc +; SI-NEXT: v_cmp_gt_f64_e64 s[44:45], s[26:27], 0 +; SI-NEXT: s_and_b64 s[16:17], s[16:17], exec +; SI-NEXT: s_cselect_b32 s16, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s17, s23, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: s_and_b32 s33, s23, 0x80000000 +; SI-NEXT: v_add_f64 v[0:1], s[42:43], v[12:13] +; SI-NEXT: s_add_i32 s42, s17, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[16:17], s[34:35], s42 +; SI-NEXT: s_andn2_b64 s[16:17], s[22:23], s[16:17] +; SI-NEXT: s_cmp_lt_i32 s42, 0 +; SI-NEXT: s_cselect_b32 s16, 0, s16 +; SI-NEXT: s_cselect_b32 s17, s33, s17 +; SI-NEXT: s_cmp_gt_i32 s42, 51 +; SI-NEXT: s_cselect_b32 s17, s23, s17 +; SI-NEXT: s_cselect_b32 s16, s22, s16 +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[24:25], 0 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[22:23], v[4:5] +; SI-NEXT: s_and_b64 s[22:23], s[40:41], vcc +; SI-NEXT: v_cmp_gt_f64_e64 s[40:41], s[30:31], 0 ; SI-NEXT: s_and_b64 s[22:23], s[22:23], exec ; SI-NEXT: s_cselect_b32 s22, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s23, s3, 0xb0014 -; SI-NEXT: s_and_b32 s33, s3, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s22 -; SI-NEXT: s_add_i32 s36, s23, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[22:23], s[28:29], s36 -; SI-NEXT: s_andn2_b64 s[22:23], s[2:3], s[22:23] -; SI-NEXT: s_cmp_lt_i32 s36, 0 -; SI-NEXT: s_cselect_b32 s38, 0, s22 -; SI-NEXT: s_cselect_b32 s33, s33, s23 -; SI-NEXT: v_cmp_gt_f64_e64 s[22:23], s[10:11], 0 -; SI-NEXT: s_cmp_gt_i32 s36, 51 -; SI-NEXT: s_cselect_b32 s37, s3, s33 -; SI-NEXT: s_cselect_b32 s36, s2, s38 -; SI-NEXT: v_add_f64 v[4:5], s[16:17], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s36 -; SI-NEXT: v_mov_b32_e32 v10, s37 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[2:3], v[9:10] -; SI-NEXT: s_and_b64 s[2:3], s[20:21], vcc +; SI-NEXT: s_bfe_u32 s23, s21, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: s_and_b32 s22, s21, 0x80000000 +; SI-NEXT: v_add_f64 v[6:7], s[16:17], v[12:13] +; SI-NEXT: s_addk_i32 s23, 0xfc01 +; SI-NEXT: s_lshr_b64 s[16:17], s[34:35], s23 +; SI-NEXT: s_andn2_b64 s[16:17], s[20:21], s[16:17] +; SI-NEXT: s_cmp_lt_i32 s23, 0 +; SI-NEXT: s_cselect_b32 s16, 0, s16 +; SI-NEXT: s_cselect_b32 s17, s22, s17 +; SI-NEXT: s_cmp_gt_i32 s23, 51 +; SI-NEXT: s_cselect_b32 s23, s21, s17 +; SI-NEXT: s_cselect_b32 s22, s20, s16 +; SI-NEXT: v_cmp_gt_f64_e64 s[16:17], s[28:29], 0 +; SI-NEXT: v_mov_b32_e32 v4, s22 +; SI-NEXT: v_mov_b32_e32 v5, s23 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[20:21], v[4:5] +; SI-NEXT: s_and_b64 s[20:21], s[18:19], vcc +; SI-NEXT: v_cmp_gt_f64_e64 s[18:19], s[2:3], 0 +; SI-NEXT: s_and_b64 s[20:21], s[20:21], exec +; SI-NEXT: s_cselect_b32 s20, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s21, s27, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v13, s20 +; SI-NEXT: s_and_b32 s33, s27, 0x80000000 +; SI-NEXT: v_add_f64 v[4:5], s[22:23], v[12:13] +; SI-NEXT: s_add_i32 s22, s21, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[20:21], s[34:35], s22 +; SI-NEXT: s_andn2_b64 s[20:21], s[26:27], s[20:21] +; SI-NEXT: s_cmp_lt_i32 s22, 0 +; SI-NEXT: s_cselect_b32 s20, 0, s20 +; SI-NEXT: s_cselect_b32 s21, s33, s21 +; SI-NEXT: s_cmp_gt_i32 s22, 51 +; SI-NEXT: s_cselect_b32 s47, s27, s21 +; SI-NEXT: s_cselect_b32 s46, s26, s20 +; SI-NEXT: v_cmp_gt_f64_e64 s[20:21], s[0:1], 0 +; SI-NEXT: v_mov_b32_e32 v8, s46 +; SI-NEXT: v_mov_b32_e32 v9, s47 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[26:27], v[8:9] +; SI-NEXT: s_and_b64 s[26:27], s[44:45], vcc +; SI-NEXT: v_cmp_gt_f64_e64 s[22:23], s[6:7], 0 +; SI-NEXT: s_and_b64 s[26:27], s[26:27], exec +; SI-NEXT: s_cselect_b32 s26, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s27, s25, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: s_and_b32 s33, s25, 0x80000000 +; SI-NEXT: v_add_f64 v[10:11], s[46:47], v[12:13] +; SI-NEXT: s_add_i32 s44, s27, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[26:27], s[34:35], s44 +; SI-NEXT: s_andn2_b64 s[26:27], s[24:25], s[26:27] +; SI-NEXT: s_cmp_lt_i32 s44, 0 +; SI-NEXT: s_cselect_b32 s26, 0, s26 +; SI-NEXT: s_cselect_b32 s27, s33, s27 +; SI-NEXT: s_cmp_gt_i32 s44, 51 +; SI-NEXT: s_cselect_b32 s45, s25, s27 +; SI-NEXT: s_cselect_b32 s44, s24, s26 +; SI-NEXT: v_cmp_gt_f64_e64 s[26:27], s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v8, s44 +; SI-NEXT: v_mov_b32_e32 v9, s45 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[24:25], v[8:9] +; SI-NEXT: s_and_b64 s[42:43], s[42:43], vcc +; SI-NEXT: v_cmp_gt_f64_e64 s[24:25], s[10:11], 0 +; SI-NEXT: s_and_b64 s[42:43], s[42:43], exec +; SI-NEXT: s_cselect_b32 s33, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s42, s31, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v13, s33 +; SI-NEXT: s_and_b32 s33, s31, 0x80000000 +; SI-NEXT: v_add_f64 v[8:9], s[44:45], v[12:13] +; SI-NEXT: s_add_i32 s44, s42, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[42:43], s[34:35], s44 +; SI-NEXT: s_andn2_b64 s[42:43], s[30:31], s[42:43] +; SI-NEXT: s_cmp_lt_i32 s44, 0 +; SI-NEXT: s_cselect_b32 s42, 0, s42 +; SI-NEXT: s_cselect_b32 s33, s33, s43 +; SI-NEXT: s_cmp_gt_i32 s44, 51 +; SI-NEXT: s_cselect_b32 s45, s31, s33 +; SI-NEXT: s_cselect_b32 s44, s30, s42 +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[8:9], 0 +; SI-NEXT: v_mov_b32_e32 v13, s44 +; SI-NEXT: v_mov_b32_e32 v14, s45 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[30:31], v[13:14] +; SI-NEXT: s_and_b64 s[40:41], s[40:41], vcc +; SI-NEXT: v_cmp_gt_f64_e64 s[30:31], s[14:15], 0 +; SI-NEXT: s_and_b64 s[40:41], s[40:41], exec +; SI-NEXT: s_cselect_b32 s33, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s40, s29, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v13, s33 +; SI-NEXT: s_and_b32 s33, s29, 0x80000000 +; SI-NEXT: v_add_f64 v[15:16], s[44:45], v[12:13] +; SI-NEXT: s_add_i32 s44, s40, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[40:41], s[34:35], s44 +; SI-NEXT: s_andn2_b64 s[40:41], s[28:29], s[40:41] +; SI-NEXT: s_cmp_lt_i32 s44, 0 +; SI-NEXT: s_cselect_b32 s46, 0, s40 +; SI-NEXT: s_cselect_b32 s33, s33, s41 +; SI-NEXT: s_cmp_gt_i32 s44, 51 +; SI-NEXT: v_cmp_gt_f64_e64 s[40:41], s[12:13], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:16 +; SI-NEXT: s_cselect_b32 s45, s29, s33 +; SI-NEXT: s_cselect_b32 s44, s28, s46 +; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v4, s44 +; SI-NEXT: v_mov_b32_e32 v5, s45 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[28:29], v[4:5] +; SI-NEXT: s_and_b64 s[16:17], s[16:17], vcc +; SI-NEXT: s_and_b64 s[16:17], s[16:17], exec +; SI-NEXT: s_cselect_b32 s16, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s17, s3, 0xb0014 +; SI-NEXT: s_and_b32 s28, s3, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_add_f64 v[13:14], s[44:45], v[12:13] +; SI-NEXT: s_add_i32 s29, s17, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[16:17], s[34:35], s29 +; SI-NEXT: s_andn2_b64 s[16:17], s[2:3], s[16:17] +; SI-NEXT: s_cmp_lt_i32 s29, 0 +; SI-NEXT: s_cselect_b32 s16, 0, s16 +; SI-NEXT: s_cselect_b32 s17, s28, s17 +; SI-NEXT: s_cmp_gt_i32 s29, 51 +; SI-NEXT: s_cselect_b32 s17, s3, s17 +; SI-NEXT: s_cselect_b32 s16, s2, s16 +; SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[36:39], 0 offset:48 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[2:3], v[4:5] +; SI-NEXT: s_and_b64 s[2:3], s[18:19], vcc ; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: s_bfe_u32 s3, s1, 0xb0014 -; SI-NEXT: s_and_b32 s16, s1, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s2 -; SI-NEXT: s_add_i32 s17, s3, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[28:29], s17 -; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3] -; SI-NEXT: s_cmp_lt_i32 s17, 0 -; SI-NEXT: s_cselect_b32 s20, 0, s2 -; SI-NEXT: s_cselect_b32 s16, s16, s3 -; SI-NEXT: v_cmp_gt_f64_e64 s[2:3], s[8:9], 0 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:48 -; SI-NEXT: s_cmp_gt_i32 s17, 51 -; SI-NEXT: s_cselect_b32 s17, s1, s16 -; SI-NEXT: s_cselect_b32 s16, s0, s20 +; SI-NEXT: s_and_b32 s18, s1, 0x80000000 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[6:7], s[36:37], v[8:9] -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v13, s2 +; SI-NEXT: s_add_i32 s19, s3, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[2:3], s[34:35], s19 +; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3] +; SI-NEXT: s_cmp_lt_i32 s19, 0 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_cselect_b32 s3, s18, s3 +; SI-NEXT: s_cmp_gt_i32 s19, 51 +; SI-NEXT: s_cselect_b32 s3, s1, s3 +; SI-NEXT: s_cselect_b32 s2, s0, s2 +; SI-NEXT: v_add_f64 v[6:7], s[16:17], v[12:13] +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v4, s2 ; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[0:1], v[4:5] -; SI-NEXT: s_and_b64 s[0:1], s[30:31], vcc +; SI-NEXT: s_and_b64 s[0:1], s[20:21], vcc ; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; SI-NEXT: s_bfe_u32 s1, s7, 0xb0014 -; SI-NEXT: s_and_b32 s20, s7, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s0 -; SI-NEXT: s_add_i32 s21, s1, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s21 +; SI-NEXT: s_and_b32 s16, s7, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v13, s0 +; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[12:13] +; SI-NEXT: s_add_i32 s2, s1, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[34:35], s2 ; SI-NEXT: s_andn2_b64 s[0:1], s[6:7], s[0:1] -; SI-NEXT: s_cmp_lt_i32 s21, 0 -; SI-NEXT: s_cselect_b32 s30, 0, s0 -; SI-NEXT: s_cselect_b32 s20, s20, s1 -; SI-NEXT: v_cmp_gt_f64_e64 s[0:1], s[14:15], 0 -; SI-NEXT: s_cmp_gt_i32 s21, 51 -; SI-NEXT: s_cselect_b32 s21, s7, s20 -; SI-NEXT: s_cselect_b32 s20, s6, s30 -; SI-NEXT: v_add_f64 v[4:5], s[16:17], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s20 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[6:7], v[9:10] -; SI-NEXT: s_and_b64 s[6:7], s[18:19], vcc -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s7, s5, 0xb0014 -; SI-NEXT: s_and_b32 s16, s5, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s6 -; SI-NEXT: s_add_i32 s17, s7, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[28:29], s17 -; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] -; SI-NEXT: s_cmp_lt_i32 s17, 0 -; SI-NEXT: s_cselect_b32 s18, 0, s6 -; SI-NEXT: s_cselect_b32 s16, s16, s7 -; SI-NEXT: v_cmp_gt_f64_e64 s[6:7], s[12:13], 0 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:64 -; SI-NEXT: s_cmp_gt_i32 s17, 51 -; SI-NEXT: s_cselect_b32 s17, s5, s16 -; SI-NEXT: s_cselect_b32 s16, s4, s18 +; SI-NEXT: s_cmp_lt_i32 s2, 0 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cselect_b32 s1, s16, s1 +; SI-NEXT: s_cmp_gt_i32 s2, 51 +; SI-NEXT: s_cselect_b32 s1, s7, s1 +; SI-NEXT: s_cselect_b32 s0, s6, s0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:64 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[6:7], s[20:21], v[8:9] -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v5, s1 +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[6:7], v[4:5] +; SI-NEXT: s_and_b64 s[2:3], s[22:23], vcc +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 +; SI-NEXT: s_and_b32 s6, s5, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v13, s2 +; SI-NEXT: s_add_i32 s7, s3, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[2:3], s[34:35], s7 +; SI-NEXT: s_andn2_b64 s[2:3], s[4:5], s[2:3] +; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_cselect_b32 s3, s6, s3 +; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cselect_b32 s3, s5, s3 +; SI-NEXT: s_cselect_b32 s2, s4, s2 +; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[12:13] +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v4, s2 ; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[4:5], v[4:5] -; SI-NEXT: s_and_b64 s[4:5], s[34:35], vcc -; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s5, s11, 0xb0014 -; SI-NEXT: s_and_b32 s18, s11, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: v_add_f64 v[4:5], s[16:17], v[8:9] -; SI-NEXT: s_add_i32 s16, s5, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], s16 -; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] -; SI-NEXT: s_cmp_lt_i32 s16, 0 -; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s18, s5 -; SI-NEXT: s_cmp_gt_i32 s16, 51 -; SI-NEXT: s_cselect_b32 s5, s11, s5 -; SI-NEXT: s_cselect_b32 s4, s10, s4 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:80 +; SI-NEXT: s_and_b64 s[0:1], s[26:27], vcc +; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s1, s11, 0xb0014 +; SI-NEXT: s_and_b32 s4, s11, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v13, s0 +; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[12:13] +; SI-NEXT: s_add_i32 s2, s1, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[34:35], s2 +; SI-NEXT: s_andn2_b64 s[0:1], s[10:11], s[0:1] +; SI-NEXT: s_cmp_lt_i32 s2, 0 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cselect_b32 s1, s4, s1 +; SI-NEXT: s_cmp_gt_i32 s2, 51 +; SI-NEXT: s_cselect_b32 s1, s11, s1 +; SI-NEXT: s_cselect_b32 s0, s10, s0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:80 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v5, s1 +; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[10:11], v[4:5] -; SI-NEXT: s_and_b64 s[10:11], s[22:23], vcc -; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec -; SI-NEXT: s_cselect_b32 s10, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s11, s9, 0xb0014 -; SI-NEXT: s_and_b32 s16, s9, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s10 -; SI-NEXT: s_add_i32 s17, s11, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[10:11], s[28:29], s17 -; SI-NEXT: s_andn2_b64 s[10:11], s[8:9], s[10:11] -; SI-NEXT: s_cmp_lt_i32 s17, 0 -; SI-NEXT: s_cselect_b32 s10, 0, s10 -; SI-NEXT: s_cselect_b32 s11, s16, s11 -; SI-NEXT: s_cmp_gt_i32 s17, 51 -; SI-NEXT: s_cselect_b32 s11, s9, s11 -; SI-NEXT: s_cselect_b32 s10, s8, s10 -; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[8:9] -; SI-NEXT: v_mov_b32_e32 v4, s10 -; SI-NEXT: v_mov_b32_e32 v5, s11 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[8:9], v[4:5] -; SI-NEXT: s_and_b64 s[2:3], s[2:3], vcc +; SI-NEXT: s_and_b64 s[2:3], s[24:25], vcc ; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 -; SI-NEXT: s_and_b32 s4, s15, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s2 +; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 +; SI-NEXT: s_and_b32 s4, s9, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v13, s2 ; SI-NEXT: s_add_i32 s5, s3, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[28:29], s5 -; SI-NEXT: s_andn2_b64 s[2:3], s[14:15], s[2:3] +; SI-NEXT: s_lshr_b64 s[2:3], s[34:35], s5 +; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[2:3] ; SI-NEXT: s_cmp_lt_i32 s5, 0 ; SI-NEXT: s_cselect_b32 s2, 0, s2 ; SI-NEXT: s_cselect_b32 s3, s4, s3 ; SI-NEXT: s_cmp_gt_i32 s5, 51 -; SI-NEXT: s_cselect_b32 s3, s15, s3 -; SI-NEXT: s_cselect_b32 s2, s14, s2 -; SI-NEXT: v_add_f64 v[4:5], s[10:11], v[8:9] -; SI-NEXT: v_mov_b32_e32 v10, s3 -; SI-NEXT: v_mov_b32_e32 v9, s2 -; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[14:15], v[9:10] -; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; SI-NEXT: s_cselect_b32 s3, s9, s3 +; SI-NEXT: s_cselect_b32 s2, s8, s2 +; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[12:13] +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v4, s2 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[8:9], v[4:5] +; SI-NEXT: s_and_b64 s[0:1], s[42:43], vcc ; SI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; SI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; SI-NEXT: s_bfe_u32 s1, s15, 0xb0014 +; SI-NEXT: s_and_b32 s4, s15, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v13, s0 +; SI-NEXT: s_add_i32 s5, s1, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[34:35], s5 +; SI-NEXT: s_andn2_b64 s[0:1], s[14:15], s[0:1] +; SI-NEXT: s_cmp_lt_i32 s5, 0 +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cselect_b32 s1, s4, s1 +; SI-NEXT: s_cmp_gt_i32 s5, 51 +; SI-NEXT: s_cselect_b32 s1, s15, s1 +; SI-NEXT: s_cselect_b32 s0, s14, s0 +; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[12:13] +; SI-NEXT: v_mov_b32_e32 v9, s1 +; SI-NEXT: v_mov_b32_e32 v8, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[14:15], v[8:9] +; SI-NEXT: s_and_b64 s[2:3], s[30:31], vcc +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: s_bfe_u32 s0, s13, 0xb0014 +; SI-NEXT: s_bfe_u32 s2, s13, 0xb0014 ; SI-NEXT: s_and_b32 s5, s13, 0x80000000 -; SI-NEXT: s_add_i32 s8, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[28:29], s8 -; SI-NEXT: s_andn2_b64 s[0:1], s[12:13], s[0:1] -; SI-NEXT: s_cmp_lt_i32 s8, 0 -; SI-NEXT: s_cselect_b32 s0, 0, s0 -; SI-NEXT: s_cselect_b32 s1, s5, s1 -; SI-NEXT: s_cmp_gt_i32 s8, 51 -; SI-NEXT: s_cselect_b32 s1, s13, s1 -; SI-NEXT: s_cselect_b32 s0, s12, s0 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:96 -; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[2:3], s[34:35], s6 +; SI-NEXT: s_andn2_b64 s[2:3], s[12:13], s[2:3] +; SI-NEXT: s_cmp_lt_i32 s6, 0 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_cselect_b32 s3, s5, s3 +; SI-NEXT: s_cmp_gt_i32 s6, 51 +; SI-NEXT: s_cselect_b32 s3, s13, s3 +; SI-NEXT: s_cselect_b32 s2, s12, s2 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:96 +; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v4, s2 ; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[12:13], v[4:5] -; SI-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SI-NEXT: s_and_b64 s[4:5], s[40:41], vcc ; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_add_f64 v[6:7], s[2:3], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: v_add_f64 v[4:5], s[0:1], v[8:9] -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:112 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[24:27], 0 +; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[12:13] +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[12:13] +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:112 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 ; SI-NEXT: s_endpgm %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone store <16 x double> %y, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index 308e86bbaf8fd..0b21ff692663b 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -2468,17 +2468,17 @@ define void @freeze_v17i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v18, vcc, 64, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GFX8-GISEL-NEXT: flat_load_dword v20, v[18:19] +; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: flat_load_dword v20, v[0:1] ; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) @@ -2487,13 +2487,15 @@ define void @freeze_v17i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 64, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) ; GFX8-GISEL-NEXT: flat_store_dword v[2:3], v20 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3469,43 +3471,43 @@ define void @freeze_v21i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 16, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, 0x50 -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX8-GISEL-NEXT: flat_load_dword v26, v[8:9] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x50 +; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v14 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: flat_load_dword v24, v[24:25] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX8-GISEL-NEXT: s_nop 0 +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] ; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 0x50, v2 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: v_add_u32_e64 v8, s[4:5], 64, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[8:9], v[20:23] -; GFX8-GISEL-NEXT: flat_store_dword v[6:7], v26 +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX8-GISEL-NEXT: flat_store_dword v[6:7], v24 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4091,13 +4093,13 @@ define void @freeze_v30i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: v_add_u32_e64 v34, s[4:5], 64, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v3, s[4:5] ; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v36, vcc, 0x60, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v33, s[4:5], 0, v3, s[4:5] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GFX8-GISEL-NEXT: v_addc_u32_e64 v33, s[4:5], 0, v3, s[4:5] -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 0x60, v2 ; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v3, s[4:5] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[12:15] @@ -4108,7 +4110,7 @@ define void @freeze_v30i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[32:33], v[24:27] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[8:9], v[28:31] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[36:37], v[28:31] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -4419,9 +4421,9 @@ define void @freeze_v31i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc @@ -4448,27 +4450,28 @@ define void @freeze_v31i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, v2, v35 +; GFX8-GISEL-NEXT: v_add_u32_e64 v35, s[4:5], 64, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v36, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, v2, v35 -; GFX8-GISEL-NEXT: v_add_u32_e64 v8, s[4:5], 64, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 0x60, v2 ; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 0x60, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[16:19] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[8:9], v[20:23] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[35:36], v[20:23] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[6:7], v[24:27] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[28:31] +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[8:9], v[28:31] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[32:34] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -4772,13 +4775,13 @@ define void @freeze_v32i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v38, 0x50 -; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v38 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v36, 0x50 +; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v36 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x60 ; GFX8-GISEL-NEXT: v_add_u32_e32 v28, vcc, v0, v14 @@ -4792,25 +4795,25 @@ define void @freeze_v32i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v36, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[36:37], v[4:7] -; GFX8-GISEL-NEXT: s_nop 0 +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, v2, v38 +; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, v2, v36 ; GFX8-GISEL-NEXT: v_add_u32_e64 v8, s[4:5], 64, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] ; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 0x60, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v3, s[4:5] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] @@ -8838,13 +8841,13 @@ define void @freeze_v16p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v38, 0x50 -; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v38 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v36, 0x50 +; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v36 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x60 ; GFX8-GISEL-NEXT: v_add_u32_e32 v28, vcc, v0, v14 @@ -8858,25 +8861,25 @@ define void @freeze_v16p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v36, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[36:37], v[4:7] -; GFX8-GISEL-NEXT: s_nop 0 +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, v2, v38 +; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, v2, v36 ; GFX8-GISEL-NEXT: v_add_u32_e64 v8, s[4:5], 64, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] ; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 0x60, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v3, s[4:5] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] @@ -9886,13 +9889,13 @@ define void @freeze_v16p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 32, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 48, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v20, vcc, 64, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v38, 0x50 -; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v38 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v36, 0x50 +; GFX8-GISEL-NEXT: v_add_u32_e32 v24, vcc, v0, v36 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0x60 ; GFX8-GISEL-NEXT: v_add_u32_e32 v28, vcc, v0, v14 @@ -9906,25 +9909,25 @@ define void @freeze_v16p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v36, vcc, 16, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 16, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX8-GISEL-NEXT: flat_store_dwordx4 v[36:37], v[4:7] -; GFX8-GISEL-NEXT: s_nop 0 +; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-GISEL-NEXT: v_add_u32_e32 v0, vcc, 32, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v4, vcc, 48, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, v2, v38 +; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, v2, v36 ; GFX8-GISEL-NEXT: v_add_u32_e64 v8, s[4:5], 64, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5] ; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 0x60, v2 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 ; GFX8-GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5] +; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0x70, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v3, s[4:5] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(7) ; GFX8-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[12:15] diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 5babe9fb3d851..8f7dc7a16529f 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -1497,43 +1497,43 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; CI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; CI-NEXT: v_and_b32_e32 v9, 0xff, v14 -; CI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; CI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; CI-NEXT: v_or_b32_e32 v12, v12, v13 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: v_lshlrev_b32_e32 v1, 24, v15 -; CI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; CI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; CI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; CI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; CI-NEXT: v_lshlrev_b32_e32 v13, 8, v29 ; CI-NEXT: v_and_b32_e32 v14, 0xff, v28 +; CI-NEXT: v_lshlrev_b32_e32 v15, 24, v27 ; CI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; CI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; CI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; CI-NEXT: v_and_b32_e32 v27, 0xff, v30 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v1, v1, v9 -; CI-NEXT: v_or_b32_e32 v9, v11, v10 -; CI-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; CI-NEXT: v_and_b32_e32 v9, 0xffff, v12 ; CI-NEXT: v_or_b32_e32 v6, v7, v6 ; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; CI-NEXT: v_lshlrev_b32_e32 v15, 24, v27 -; CI-NEXT: v_and_b32_e32 v27, 0xff, v30 ; CI-NEXT: v_or_b32_e32 v13, v14, v13 ; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 ; CI-NEXT: v_or_b32_e32 v7, v3, v2 -; CI-NEXT: v_or_b32_e32 v3, v10, v1 +; CI-NEXT: v_or_b32_e32 v3, v9, v1 ; CI-NEXT: v_or_b32_e32 v1, v4, v6 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; CI-NEXT: v_or_b32_e32 v11, v15, v14 +; CI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; CI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; CI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; CI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; CI-NEXT: v_or_b32_e32 v0, v0, v7 -; CI-NEXT: v_or_b32_e32 v2, v8, v9 +; CI-NEXT: v_or_b32_e32 v10, v11, v10 +; CI-NEXT: v_or_b32_e32 v11, v15, v14 +; CI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; CI-NEXT: v_or_b32_e32 v2, v8, v10 ; CI-NEXT: v_and_b32_e32 v8, 0xff, v20 ; CI-NEXT: v_and_b32_e32 v9, 0xff, v16 ; CI-NEXT: s_mov_b64 s[4:5], 16 @@ -1541,12 +1541,12 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; CI-NEXT: v_or_b32_e32 v5, v24, v25 -; CI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; CI-NEXT: v_or_b32_e32 v4, v4, v26 -; CI-NEXT: v_or_b32_e32 v6, v5, v11 -; CI-NEXT: v_and_b32_e32 v5, 0xff, v22 ; CI-NEXT: v_or_b32_e32 v7, v12, v4 +; CI-NEXT: v_or_b32_e32 v4, v24, v25 +; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; CI-NEXT: v_and_b32_e32 v5, 0xff, v22 +; CI-NEXT: v_or_b32_e32 v6, v4, v11 ; CI-NEXT: v_lshlrev_b32_e32 v4, 24, v23 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_or_b32_e32 v4, v4, v5 @@ -2835,9 +2835,9 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 ; CIGFX89: ; %bb.0: ; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; CIGFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 ; CIGFX89-NEXT: s_mov_b32 s6, -1 ; CIGFX89-NEXT: s_waitcnt vmcnt(3) @@ -3172,8 +3172,6 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 @@ -3181,6 +3179,8 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3199,12 +3199,12 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v10, v38 +; CI-NEXT: v_mul_f32_e32 v8, 1.0, v36 ; CI-NEXT: v_mul_f32_e32 v4, 1.0, v32 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v38 ; CI-NEXT: v_mul_f32_e32 v5, 1.0, v33 ; CI-NEXT: v_mul_f32_e32 v6, 1.0, v34 ; CI-NEXT: v_mul_f32_e32 v7, 1.0, v35 -; CI-NEXT: v_mul_f32_e32 v8, 1.0, v36 ; CI-NEXT: v_mul_f32_e32 v9, 1.0, v37 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3241,11 +3241,11 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; GFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; GFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 ; GFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; GFX89-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; GFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; GFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_waitcnt vmcnt(5) @@ -3267,10 +3267,10 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: buffer_store_dword v34, off, s[4:7], 0 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: buffer_store_dword v35, off, s[4:7], 0 -; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: buffer_store_dword v36, off, s[4:7], 0 ; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_store_dword v35, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: buffer_store_dwordx2 v[32:33], off, s[4:7], 0 ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: s_setpc_b64 s[30:31] @@ -3488,7 +3488,6 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 ; CIGFX89: ; %bb.0: ; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CIGFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 ; CIGFX89-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 ; CIGFX89-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; CIGFX89-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 @@ -3496,6 +3495,7 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 ; CIGFX89-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 ; CIGFX89-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 ; CIGFX89-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; CIGFX89-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 ; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 ; CIGFX89-NEXT: s_mov_b32 s6, -1 ; CIGFX89-NEXT: s_waitcnt vmcnt(8) @@ -3571,8 +3571,6 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 ; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 @@ -3580,6 +3578,8 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 ; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 ; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3620,8 +3620,6 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 @@ -3629,6 +3627,8 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3669,8 +3669,6 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 @@ -3678,6 +3676,8 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 ; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3783,11 +3783,11 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 ; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 ; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 ; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3856,11 +3856,11 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 ; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3929,11 +3929,11 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 ; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4278,8 +4278,6 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 @@ -4287,6 +4285,8 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 ; CI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 ; CI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4351,8 +4351,6 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:64 @@ -4360,6 +4358,8 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; VI-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56 ; VI-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ubyte v38, off, s[0:3], s32 offset:40 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4424,8 +4424,6 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ubyte v33, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:64 @@ -4433,6 +4431,8 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; GFX9-NEXT: buffer_load_ubyte v36, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ubyte v37, off, s[0:3], s32 offset:36 ; GFX9-NEXT: buffer_load_ubyte v38, off, s[0:3], s32 offset:40 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index 0084d936ec03b..9c3d379f8608a 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -1335,48 +1335,48 @@ define <33 x i32> @v33i32_func_void() #0 { ; CI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; CI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; CI-NEXT: s_waitcnt vmcnt(11) -; CI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; CI-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; CI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; CI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; CI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 -; CI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; CI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; CI-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen ; CI-NEXT: s_waitcnt vmcnt(14) -; CI-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; CI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 -; CI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; CI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 -; CI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 -; CI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; CI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; CI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; CI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; CI-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; CI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; CI-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; CI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 ; CI-NEXT: s_waitcnt vmcnt(14) -; CI-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v20, v8, s[0:3], 0 offen ; CI-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v17, v7, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v24, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v22, v9, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 ; CI-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 @@ -1425,48 +1425,48 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x74, v0 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x6c, v0 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x64, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x68, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; GFX8-NEXT: s_waitcnt vmcnt(11) -; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 -; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 -; GFX8-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x58, v0 -; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x50, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 +; GFX8-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x60, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x58, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x54, v0 +; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x4c, v0 +; GFX8-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x48, v0 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x44, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 60, v0 -; GFX8-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 52, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 48, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 44, v0 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 40, v0 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 36, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x48, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x44, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 64, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 60, v0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 52, v0 +; GFX8-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 48, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 44, v0 +; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 36, v0 ; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v8, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v17, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v24, v10, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v9, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 ; GFX8-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 @@ -1495,16 +1495,18 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 ; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 -; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 ; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 ; GFX9-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116 @@ -1525,8 +1527,6 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52 @@ -1618,48 +1618,48 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; CI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; CI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; CI-NEXT: s_waitcnt vmcnt(11) -; CI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 -; CI-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; CI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; CI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 -; CI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 -; CI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; CI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; CI-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen ; CI-NEXT: s_waitcnt vmcnt(14) -; CI-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; CI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 -; CI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; CI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 -; CI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 -; CI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; CI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; CI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 +; CI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; CI-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; CI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 +; CI-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; CI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 ; CI-NEXT: s_waitcnt vmcnt(14) -; CI-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v20, v8, s[0:3], 0 offen ; CI-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v17, v7, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v24, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v22, v9, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 ; CI-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 @@ -1708,48 +1708,48 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x74, v0 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x6c, v0 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x64, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x68, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; GFX8-NEXT: s_waitcnt vmcnt(11) -; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 -; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 -; GFX8-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x58, v0 -; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x50, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 +; GFX8-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x60, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x58, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x54, v0 +; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x4c, v0 +; GFX8-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x48, v0 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x44, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 60, v0 -; GFX8-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 52, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 48, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 44, v0 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 40, v0 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 36, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x48, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x44, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 64, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 60, v0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 52, v0 +; GFX8-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 48, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 44, v0 +; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 36, v0 ; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v8, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v17, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v24, v10, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v9, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 ; GFX8-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 @@ -1778,16 +1778,18 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 ; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 -; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 ; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 ; GFX9-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116 @@ -1808,8 +1810,6 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52 @@ -1900,50 +1900,50 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; CI-NEXT: v_add_i32_e32 v3, vcc, 0xf4, v0 ; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xec, v0 ; CI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0xec, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, 0xe4, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, 0xe8, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xe4, v0 ; CI-NEXT: s_waitcnt vmcnt(11) -; CI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0xe0, v0 -; CI-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v2, vcc, 0xdc, v0 -; CI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v3, vcc, 0xd8, v0 -; CI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v1, vcc, 0xd4, v0 -; CI-NEXT: v_add_i32_e32 v4, vcc, 0xd0, v0 -; CI-NEXT: v_add_i32_e32 v5, vcc, 0xcc, v0 -; CI-NEXT: v_add_i32_e32 v6, vcc, 0xc8, v0 +; CI-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xe0, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xdc, v0 +; CI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0xd8, v0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0xd4, v0 +; CI-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xd0, v0 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0xcc, v0 +; CI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xc8, v0 ; CI-NEXT: s_waitcnt vmcnt(14) -; CI-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 0xb8, v0 -; CI-NEXT: v_add_i32_e32 v7, vcc, 0xc4, v0 -; CI-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0 -; CI-NEXT: v_add_i32_e32 v3, vcc, 0xbc, v0 -; CI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v4, vcc, 0xb4, v0 -; CI-NEXT: v_add_i32_e32 v8, vcc, 0xb0, v0 -; CI-NEXT: v_add_i32_e32 v9, vcc, 0xac, v0 -; CI-NEXT: v_add_i32_e32 v10, vcc, 0xa8, v0 -; CI-NEXT: v_add_i32_e32 v11, vcc, 0xa4, v0 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0xc4, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xc0, v0 +; CI-NEXT: v_add_i32_e32 v8, vcc, 0xbc, v0 +; CI-NEXT: v_add_i32_e32 v11, vcc, 0xb4, v0 +; CI-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v7, vcc, 0xb0, v0 +; CI-NEXT: v_add_i32_e32 v10, vcc, 0xac, v0 +; CI-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xa8, v0 +; CI-NEXT: v_add_i32_e32 v9, vcc, 0xa4, v0 ; CI-NEXT: s_waitcnt vmcnt(14) -; CI-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v5, vcc, 0xa0, v0 -; CI-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v6, vcc, 0xa0, v0 +; CI-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v20, v8, s[0:3], 0 offen ; CI-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen -; CI-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v17, v7, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v24, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v22, v9, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v21, v6, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0 ; CI-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v1, vcc, 0x98, v0 @@ -1990,50 +1990,50 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xf4, v0 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xf0, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xec, v0 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xec, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xe8, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xe4, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe8, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xe4, v0 ; GFX8-NEXT: s_waitcnt vmcnt(11) -; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe0, v0 -; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xdc, v0 -; GFX8-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd8, v0 -; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xd4, v0 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xd0, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xcc, v0 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xc8, v0 +; GFX8-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xe0, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xdc, v0 +; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xd8, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xd4, v0 +; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xd0, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xcc, v0 +; GFX8-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xc8, v0 ; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xb8, v0 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xc4, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc0, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xbc, v0 -; GFX8-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xb4, v0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xb0, v0 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xac, v0 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xa8, v0 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xa4, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xc4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xc0, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xbc, v0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xb4, v0 +; GFX8-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xb0, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xac, v0 +; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xa8, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xa4, v0 ; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xa0, v0 -; GFX8-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v16, v6, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xa0, v0 +; GFX8-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v8, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v17, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v24, v10, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v9, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v21, v6, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x9c, v0 ; GFX8-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x98, v0 @@ -2061,16 +2061,18 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240 ; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 ; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 ; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192 -; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176 ; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160 ; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144 ; GFX9-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:252 ; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:248 ; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:244 @@ -2091,8 +2093,6 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:196 ; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:192 ; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:188 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184 ; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:180 @@ -2630,25 +2630,25 @@ define <32 x bfloat> @v32bf16_func_void() #0 { ; CI-NEXT: v_mov_b32_e32 v9, v1 ; CI-NEXT: v_mov_b32_e32 v10, v2 ; CI-NEXT: v_mov_b32_e32 v11, v3 +; CI-NEXT: v_mov_b32_e32 v12, v4 +; CI-NEXT: v_mov_b32_e32 v13, v5 +; CI-NEXT: v_mov_b32_e32 v14, v6 +; CI-NEXT: v_mov_b32_e32 v15, v7 ; CI-NEXT: v_mov_b32_e32 v16, v0 ; CI-NEXT: v_mov_b32_e32 v17, v1 ; CI-NEXT: v_mov_b32_e32 v18, v2 ; CI-NEXT: v_mov_b32_e32 v19, v3 +; CI-NEXT: v_mov_b32_e32 v20, v4 +; CI-NEXT: v_mov_b32_e32 v21, v5 +; CI-NEXT: v_mov_b32_e32 v22, v6 +; CI-NEXT: v_mov_b32_e32 v23, v7 ; CI-NEXT: v_mov_b32_e32 v24, v0 ; CI-NEXT: v_mov_b32_e32 v25, v1 ; CI-NEXT: v_mov_b32_e32 v26, v2 ; CI-NEXT: v_mov_b32_e32 v27, v3 -; CI-NEXT: v_mov_b32_e32 v12, v4 -; CI-NEXT: v_mov_b32_e32 v20, v4 ; CI-NEXT: v_mov_b32_e32 v28, v4 -; CI-NEXT: v_mov_b32_e32 v13, v5 -; CI-NEXT: v_mov_b32_e32 v21, v5 ; CI-NEXT: v_mov_b32_e32 v29, v5 -; CI-NEXT: v_mov_b32_e32 v14, v6 -; CI-NEXT: v_mov_b32_e32 v22, v6 ; CI-NEXT: v_mov_b32_e32 v30, v6 -; CI-NEXT: v_mov_b32_e32 v15, v7 -; CI-NEXT: v_mov_b32_e32 v23, v7 ; CI-NEXT: v_mov_b32_e32 v31, v7 ; CI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 234eaa8af7edf..1f7fdaa21f4c3 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -5734,48 +5734,48 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: v_mov_b32_e32 v18, v33 ; GFX9-NEXT: v_mov_b32_e32 v19, v34 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v27 -; GFX9-NEXT: v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 -; GFX9-NEXT: v_or_b32_sdwa v4, v26, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v9 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19 ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dwordx4 v[42:43], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[40:41], v[6:9], off +; GFX9-NEXT: global_store_dwordx4 v[40:41], v[9:12], off ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index ba81446a4bc09..769bec6b670d5 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2518,12 +2518,15 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:200 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:196 +; GFX11-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:224 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:220 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:216 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:212 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:208 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:204 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:200 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:196 ; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:192 ; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:188 ; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:184 @@ -2532,10 +2535,18 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168 ; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164 -; GFX11-NEXT: s_clause 0x11 +; GFX11-NEXT: v_dual_mov_b32 v47, v31 :: v_dual_mov_b32 v46, v30 +; GFX11-NEXT: v_mov_b32_e32 v45, v29 +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s32 offset:228 ; 16-byte Folded Spill +; GFX11-NEXT: v_dual_mov_b32 v28, v8 :: v_dual_mov_b32 v27, v7 +; GFX11-NEXT: v_dual_mov_b32 v26, v6 :: v_dual_mov_b32 v25, v5 +; GFX11-NEXT: v_dual_mov_b32 v32, v4 :: v_dual_mov_b32 v31, v3 +; GFX11-NEXT: v_dual_mov_b32 v30, v2 :: v_dual_mov_b32 v29, v1 +; GFX11-NEXT: s_clause 0x1b ; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 ; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12 ; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 ; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28 ; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24 @@ -2551,64 +2562,66 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v8, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v7, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v6, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v4, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:140 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: s_clause 0x10 -; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v16, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:84 +; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v1, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:100 ; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 ; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52 ; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:36 ; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 -; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off offset:272 -; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: scratch_store_b128 v0, v[12:15], off offset:256 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: scratch_load_b32 v12, off, s32 +; GFX11-NEXT: v_dual_mov_b32 v9, v45 :: v_dual_mov_b32 v10, v46 +; GFX11-NEXT: v_mov_b32_e32 v11, v47 ; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: scratch_store_b128 v0, v[16:19], off offset:240 +; GFX11-NEXT: scratch_store_b128 v0, v[20:23], off offset:272 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b128 v0, v[20:23], off offset:224 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:256 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: scratch_store_b128 v0, v[56:59], off offset:208 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:240 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 v0, v[41:44], off offset:192 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off offset:224 +; GFX11-NEXT: scratch_store_b128 v0, v[56:59], off offset:208 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 v0, v[37:40], off offset:176 +; GFX11-NEXT: scratch_store_b128 v0, v[41:44], off offset:192 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160 +; GFX11-NEXT: scratch_store_b128 v0, v[37:40], off offset:176 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144 +; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160 ; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144 ; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 -; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Reload +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:112 +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s32 offset:228 ; 16-byte Folded Reload +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off +; GFX11-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172 @@ -2617,11 +2630,14 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:184 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:188 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:192 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:196 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:200 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:204 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:208 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:196 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:200 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:204 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:208 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:212 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:220 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:224 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ret <72 x i32> %val @@ -2775,39 +2791,39 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:788 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:516 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:516 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:520 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:524 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:528 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:532 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:536 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:540 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:544 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:556 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:564 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:528 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:568 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:532 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:572 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:536 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:576 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:540 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:580 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:544 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:560 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:564 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:568 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:572 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:576 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:580 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:584 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:588 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:592 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:596 @@ -2863,13 +2879,20 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-NEXT: v_mov_b32_e32 v5, v15 +; GFX9-NEXT: v_mov_b32_e32 v6, v16 +; GFX9-NEXT: v_mov_b32_e32 v7, v17 +; GFX9-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 @@ -3008,8 +3031,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v63, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: s_clause 0x28 -; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636 +; GFX10-NEXT: s_clause 0x3a +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:648 @@ -3049,31 +3072,14 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:788 ; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792 ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:516 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:524 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:528 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:532 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:536 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:540 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill -; GFX10-NEXT: s_clause 0x15 +; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:516 +; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:520 +; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:524 +; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:528 +; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:532 +; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:536 +; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:540 +; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:544 ; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 ; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 ; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556 @@ -3085,20 +3091,37 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:580 ; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:584 ; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:588 -; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:592 -; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:596 -; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:600 -; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:604 -; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:608 -; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:612 -; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:616 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:592 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:596 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:600 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:604 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:608 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:612 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:616 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v20, 24 +; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:620 ; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:624 ; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:628 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632 -; GFX10-NEXT: v_mov_b32_e32 v0, 24 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:12 @@ -3138,15 +3161,22 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 +; GFX10-NEXT: v_mov_b32_e32 v2, v27 +; GFX10-NEXT: v_mov_b32_e32 v3, v21 +; GFX10-NEXT: v_mov_b32_e32 v4, v22 +; GFX10-NEXT: v_mov_b32_e32 v5, v23 +; GFX10-NEXT: v_mov_b32_e32 v6, v24 +; GFX10-NEXT: v_mov_b32_e32 v7, v25 +; GFX10-NEXT: v_mov_b32_e32 v8, v26 ; GFX10-NEXT: s_clause 0x7 ; 32-byte Folded Reload -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540 -; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1548 -; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1552 -; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1556 -; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1560 -; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:1564 +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:1564 +; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:1560 +; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:1556 +; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:1552 +; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:1548 +; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:1544 +; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:1540 +; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:1536 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 @@ -3187,7 +3217,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:1600 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v63, s33 offset:1584 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3199,19 +3229,22 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_mov_b32 s36, s34 ; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Spill -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v47, s33 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v56, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v57, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v58, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v59, s33 +; GFX11-NEXT: s_clause 0xe ; 60-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v47, s33 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v56, s33 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v57, s33 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v58, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v59, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v61, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v62, s33 ; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 ; GFX11-NEXT: s_add_i32 s1, s32, 0x90 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 @@ -3232,7 +3265,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: s_add_i32 s1, s32, 16 ; GFX11-NEXT: s_add_i32 s2, s33, 0x200 -; GFX11-NEXT: v_writelane_b32 v60, s30, 0 +; GFX11-NEXT: v_writelane_b32 v63, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0 @@ -3253,124 +3286,126 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0 ; GFX11-NEXT: s_mov_b32 s1, return_72xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, return_72xi32@abs32@lo -; GFX11-NEXT: v_writelane_b32 v60, s31, 1 +; GFX11-NEXT: v_writelane_b32 v63, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624 ; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:736 +; GFX11-NEXT: scratch_load_b128 v[59:62], off, s33 offset:512 ; GFX11-NEXT: s_add_i32 s2, s32, 0xa0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_dual_mov_b32 v41, v48 :: v_dual_mov_b32 v42, v33 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_mov_b32_e32 v32, v48 -; GFX11-NEXT: s_clause 0x9 -; GFX11-NEXT: scratch_load_b128 v[48:51], off, s33 offset:656 -; GFX11-NEXT: scratch_load_b128 v[52:55], off, s33 offset:672 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s33 offset:1568 ; 16-byte Folded Spill +; GFX11-NEXT: v_dual_mov_b32 v43, v34 :: v_dual_mov_b32 v44, v35 +; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:784 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:768 +; GFX11-NEXT: scratch_load_b128 v[4:7], off, s33 offset:752 +; GFX11-NEXT: scratch_load_b128 v[8:11], off, s33 offset:720 +; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:704 ; GFX11-NEXT: scratch_load_b128 v[37:40], off, s33 offset:688 -; GFX11-NEXT: scratch_load_b128 v[41:44], off, s33 offset:704 -; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:720 -; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:736 -; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:752 -; GFX11-NEXT: scratch_load_b128 v[4:7], off, s33 offset:768 -; GFX11-NEXT: scratch_load_b128 v[8:11], off, s33 offset:784 -; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:512 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[16:19], s33 offset:1584 ; 16-byte Folded Spill -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b128 v[48:51], off, s33 offset:672 +; GFX11-NEXT: scratch_load_b128 v[32:35], off, s33 offset:656 ; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:528 ; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 ; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:560 ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:576 -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v7, v10 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_mov_b32_e32 v10, v21 +; GFX11-NEXT: scratch_load_b128 v[52:55], off, s33 offset:592 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1568 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:592 +; GFX11-NEXT: scratch_store_b128 off, v[52:55], s33 offset:1552 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_load_b128 v[52:55], off, s33 offset:608 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1536 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 -; GFX11-NEXT: v_mov_b32_e32 v32, v36 -; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 -; GFX11-NEXT: v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v48, v51 -; GFX11-NEXT: v_dual_mov_b32 v49, v52 :: v_dual_mov_b32 v50, v53 -; GFX11-NEXT: v_dual_mov_b32 v51, v54 :: v_dual_mov_b32 v36, v55 -; GFX11-NEXT: v_dual_mov_b32 v53, v41 :: v_dual_mov_b32 v52, v40 -; GFX11-NEXT: v_dual_mov_b32 v54, v42 :: v_dual_mov_b32 v41, v56 -; GFX11-NEXT: v_dual_mov_b32 v55, v43 :: v_dual_mov_b32 v40, v44 -; GFX11-NEXT: v_dual_mov_b32 v42, v57 :: v_dual_mov_b32 v57, v12 -; GFX11-NEXT: v_dual_mov_b32 v43, v58 :: v_dual_mov_b32 v56, v59 -; GFX11-NEXT: v_mov_b32_e32 v58, v13 -; GFX11-NEXT: v_dual_mov_b32 v12, v15 :: v_dual_mov_b32 v13, v0 -; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v0, v3 -; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6 -; GFX11-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9 -; GFX11-NEXT: v_mov_b32_e32 v9, v20 -; GFX11-NEXT: scratch_store_b32 off, v11, s2 +; GFX11-NEXT: scratch_store_b128 off, v[52:55], s33 offset:1536 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[41:44], s32 +; GFX11-NEXT: v_dual_mov_b32 v41, v3 :: v_dual_mov_b32 v42, v56 +; GFX11-NEXT: v_dual_mov_b32 v43, v57 :: v_dual_mov_b32 v44, v58 +; GFX11-NEXT: scratch_store_b32 off, v59, s2 ; GFX11-NEXT: s_add_i32 s2, s32, 0x90 -; GFX11-NEXT: v_mov_b32_e32 v11, v22 -; GFX11-NEXT: scratch_store_b128 off, v[4:7], s2 +; GFX11-NEXT: scratch_store_b128 off, v[41:44], s2 +; GFX11-NEXT: v_dual_mov_b32 v41, v7 :: v_dual_mov_b32 v42, v0 +; GFX11-NEXT: v_dual_mov_b32 v43, v1 :: v_dual_mov_b32 v44, v2 ; GFX11-NEXT: s_add_i32 s2, s32, 0x80 -; GFX11-NEXT: v_mov_b32_e32 v5, v16 -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 -; GFX11-NEXT: v_mov_b32_e32 v0, 24 +; GFX11-NEXT: v_dual_mov_b32 v7, v18 :: v_dual_mov_b32 v18, v29 +; GFX11-NEXT: v_mov_b32_e32 v29, v45 +; GFX11-NEXT: scratch_store_b128 off, v[41:44], s2 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:1568 ; 16-byte Folded Reload +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, v5 ; GFX11-NEXT: s_add_i32 s2, s32, 0x70 -; GFX11-NEXT: v_mov_b32_e32 v6, v17 -; GFX11-NEXT: scratch_store_b128 off, v[12:15], s2 +; GFX11-NEXT: v_dual_mov_b32 v5, v16 :: v_dual_mov_b32 v16, v27 +; GFX11-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v52, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v17 +; GFX11-NEXT: v_dual_mov_b32 v17, v28 :: v_dual_mov_b32 v4, v62 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v1, v52 ; GFX11-NEXT: s_add_i32 s2, s32, 0x6c -; GFX11-NEXT: v_mov_b32_e32 v7, v18 +; GFX11-NEXT: v_dual_mov_b32 v2, v53 :: v_dual_mov_b32 v3, v10 ; GFX11-NEXT: scratch_store_b32 off, v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, v11 ; GFX11-NEXT: s_add_i32 s2, s32, 0x60 -; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26 -; GFX11-NEXT: scratch_store_b96 off, v[56:58], s2 +; GFX11-NEXT: v_dual_mov_b32 v10, v21 :: v_dual_mov_b32 v11, v22 +; GFX11-NEXT: scratch_store_b96 off, v[0:2], s2 +; GFX11-NEXT: v_dual_mov_b32 v0, v15 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_mov_b32_e32 v2, v9 ; GFX11-NEXT: s_add_i32 s2, s32, 0x50 -; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 -; GFX11-NEXT: scratch_store_b128 off, v[40:43], s2 +; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26 +; GFX11-NEXT: v_mov_b32_e32 v9, v20 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 +; GFX11-NEXT: v_mov_b32_e32 v0, v40 +; GFX11-NEXT: v_dual_mov_b32 v1, v12 :: v_dual_mov_b32 v2, v13 +; GFX11-NEXT: v_mov_b32_e32 v3, v14 ; GFX11-NEXT: s_add_i32 s2, s32, 64 -; GFX11-NEXT: v_mov_b32_e32 v13, v24 -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s2 -; GFX11-NEXT: s_add_i32 s2, s32, 48 +; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v13, v24 ; GFX11-NEXT: v_mov_b32_e32 v14, v25 -; GFX11-NEXT: scratch_store_b128 off, v[36:39], s2 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 +; GFX11-NEXT: v_dual_mov_b32 v0, v51 :: v_dual_mov_b32 v1, v37 +; GFX11-NEXT: v_dual_mov_b32 v2, v38 :: v_dual_mov_b32 v3, v39 +; GFX11-NEXT: s_add_i32 s2, s32, 48 +; GFX11-NEXT: v_dual_mov_b32 v19, v30 :: v_dual_mov_b32 v20, v31 +; GFX11-NEXT: v_mov_b32_e32 v30, v46 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 +; GFX11-NEXT: v_dual_mov_b32 v0, v35 :: v_dual_mov_b32 v3, v50 +; GFX11-NEXT: v_dual_mov_b32 v1, v48 :: v_dual_mov_b32 v2, v49 ; GFX11-NEXT: s_add_i32 s2, s32, 32 -; GFX11-NEXT: v_mov_b32_e32 v16, v27 -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s2 +; GFX11-NEXT: v_mov_b32_e32 v31, v47 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 +; GFX11-NEXT: v_mov_b32_e32 v0, v36 +; GFX11-NEXT: v_dual_mov_b32 v1, v32 :: v_dual_mov_b32 v2, v33 +; GFX11-NEXT: v_mov_b32_e32 v3, v34 ; GFX11-NEXT: s_add_i32 s2, s32, 16 -; GFX11-NEXT: v_mov_b32_e32 v30, v46 -; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 -; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload -; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 +; GFX11-NEXT: s_clause 0x1 ; 32-byte Folded Reload ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 ; GFX11-NEXT: s_add_i32 s2, s33, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 42 +; GFX11-NEXT: v_dual_mov_b32 v2, v60 :: v_dual_mov_b32 v3, v61 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 42 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload -; GFX11-NEXT: scratch_load_b32 v59, off, s33 -; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v56, off, s33 offset:12 -; GFX11-NEXT: scratch_load_b32 v47, off, s33 offset:16 -; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:20 -; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:24 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:28 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:32 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:36 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:40 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:44 -; GFX11-NEXT: v_readlane_b32 s31, v60, 1 -; GFX11-NEXT: v_readlane_b32 s30, v60, 0 +; GFX11-NEXT: s_clause 0xe ; 60-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v62, off, s33 +; GFX11-NEXT: scratch_load_b32 v61, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v59, off, s33 offset:12 +; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:16 +; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:20 +; GFX11-NEXT: scratch_load_b32 v56, off, s33 offset:24 +; GFX11-NEXT: scratch_load_b32 v47, off, s33 offset:28 +; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:32 +; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:36 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:40 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:44 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:48 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:52 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:56 +; GFX11-NEXT: v_readlane_b32 s31, v63, 1 +; GFX11-NEXT: v_readlane_b32 s30, v63, 0 ; GFX11-NEXT: s_mov_b32 s32, s34 ; GFX11-NEXT: s_mov_b32 s34, s36 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1600 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v63, off, s33 offset:1584 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s33, s35 ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 8e427a6ef2023..a3c329f34e2de 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1477,57 +1477,57 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 -; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: s_addc_u32 s5, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v14, s3 -; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: v_mov_b32_e32 v22, s3 +; CI-NEXT: v_mov_b32_e32 v21, s2 ; CI-NEXT: s_add_u32 s2, s0, 48 +; CI-NEXT: v_mov_b32_e32 v20, s1 ; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v19, s0 +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v24, s3 +; CI-NEXT: v_mov_b32_e32 v23, s2 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v0 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v6 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; CI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; CI-NEXT: flat_store_dwordx4 v[21:22], v[15:18] +; CI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v17 ; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_mov_b32_e32 v17, s1 -; CI-NEXT: v_mov_b32_e32 v14, s2 -; CI-NEXT: v_mov_b32_e32 v16, s0 -; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] +; CI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[23:24], v[12:15] +; CI-NEXT: flat_store_dwordx4 v[4:5], v[8:11] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v16f16_to_v16f32: @@ -2033,43 +2033,43 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v7, s3 -; CI-NEXT: v_mov_b32_e32 v6, s2 +; CI-NEXT: v_mov_b32_e32 v9, s3 +; CI-NEXT: v_mov_b32_e32 v8, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 -; CI-NEXT: v_mov_b32_e32 v13, s1 +; CI-NEXT: v_mov_b32_e32 v15, s1 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: v_mov_b32_e32 v14, s0 ; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_mov_b32_e32 v17, s3 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v16, s2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v11 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; CI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 -; CI-NEXT: v_mov_b32_e32 v17, s1 -; CI-NEXT: v_mov_b32_e32 v16, s0 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: flat_store_dwordx4 v[8:9], v[2:5] +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v18 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v19 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: flat_store_dwordx4 v[16:17], v[10:13] +; CI-NEXT: flat_store_dwordx4 v[4:5], v[6:9] +; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v8f16_to_v8f64: @@ -2084,39 +2084,39 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v8, s3 -; VI-NEXT: v_mov_b32_e32 v7, s2 +; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 -; VI-NEXT: v_mov_b32_e32 v13, s1 +; VI-NEXT: v_mov_b32_e32 v15, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v12, s0 +; VI-NEXT: v_mov_b32_e32 v14, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: v_mov_b32_e32 v15, s3 +; VI-NEXT: v_mov_b32_e32 v17, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: v_mov_b32_e32 v16, s2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v11, v2 -; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 -; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 -; VI-NEXT: flat_store_dwordx4 v[7:8], v[3:6] -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v11 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 -; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 -; VI-NEXT: v_mov_b32_e32 v17, s1 -; VI-NEXT: v_mov_b32_e32 v16, s0 -; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[16:17], v[4:7] -; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; VI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[2:5] +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v19 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v18 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[10:13] +; VI-NEXT: flat_store_dwordx4 v[4:5], v[6:9] +; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: global_extload_v8f16_to_v8f64: @@ -2228,66 +2228,67 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 ; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: s_nop 0 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: v_mov_b32_e32 v14, s2 -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v1 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; CI-NEXT: flat_store_dwordx4 v[18:19], v[8:11] ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_mov_b32_e32 v14, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x60 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; CI-NEXT: v_mov_b32_e32 v21, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v5 ; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 -; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: v_mov_b32_e32 v20, s2 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v11 ; CI-NEXT: s_add_u32 s2, s0, 0x50 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v10 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v9 +; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v8 ; CI-NEXT: s_add_u32 s0, s0, 64 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v18 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 -; CI-NEXT: v_mov_b32_e32 v19, s3 -; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v18, s2 -; CI-NEXT: v_mov_b32_e32 v12, s0 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; CI-NEXT: flat_store_dwordx4 v[14:15], v[2:5] +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v19 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: flat_store_dwordx4 v[20:21], v[10:13] +; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v16f16_to_v16f64: @@ -2318,9 +2319,19 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: v_mov_b32_e32 v18, s3 ; VI-NEXT: v_mov_b32_e32 v17, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x50 -; VI-NEXT: v_mov_b32_e32 v12, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v20, s3 +; VI-NEXT: v_mov_b32_e32 v12, s1 +; VI-NEXT: v_mov_b32_e32 v19, s2 +; VI-NEXT: s_add_u32 s2, s0, 64 ; VI-NEXT: v_mov_b32_e32 v11, s0 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v22, s3 +; VI-NEXT: v_mov_b32_e32 v21, s2 +; VI-NEXT: s_add_u32 s2, s0, 0x70 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s0, s0, 0x60 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -2331,53 +2342,45 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 ; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; VI-NEXT: v_mov_b32_e32 v14, s3 +; VI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v10, v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: v_mov_b32_e32 v13, s2 -; VI-NEXT: s_add_u32 s2, s0, 64 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; VI-NEXT: v_mov_b32_e32 v16, s3 +; VI-NEXT: s_nop 0 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 ; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v4 -; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; VI-NEXT: v_mov_b32_e32 v15, s2 -; VI-NEXT: s_add_u32 s2, s0, 0x70 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] -; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9 -; VI-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7] -; VI-NEXT: v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v6 +; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; VI-NEXT: v_cvt_f32_f16_sdwa v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v15, s3 +; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: flat_store_dwordx4 v[17:18], v[5:8] +; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 -; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v9 -; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 -; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 -; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 -; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 -; VI-NEXT: s_add_u32 s0, s0, 0x60 -; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4] -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 -; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 -; VI-NEXT: v_mov_b32_e32 v20, s3 -; VI-NEXT: v_mov_b32_e32 v14, s1 -; VI-NEXT: v_mov_b32_e32 v19, s2 -; VI-NEXT: v_mov_b32_e32 v13, s0 -; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12] -; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] -; VI-NEXT: flat_store_dwordx4 v[13:14], v[5:8] +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v9 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v6 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; VI-NEXT: flat_store_dwordx4 v[11:12], v[2:5] +; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v13 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v17 +; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v9 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: flat_store_dwordx4 v[19:20], v[2:5] +; VI-NEXT: s_nop 0 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[21:22], v[10:13] +; VI-NEXT: flat_store_dwordx4 v[14:15], v[6:9] +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: global_extload_v16f16_to_v16f64: @@ -2955,51 +2958,52 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; CI-NEXT: v_mov_b32_e32 v13, s3 -; CI-NEXT: v_mov_b32_e32 v12, s2 ; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; CI-NEXT: v_mov_b32_e32 v12, s2 ; CI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_mov_b32_e32 v16, s2 ; CI-NEXT: s_waitcnt vmcnt(3) ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_mov_b32_e32 v5, s3 ; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v2, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_or_b32_e32 v0, v0, v18 ; CI-NEXT: v_or_b32_e32 v3, v6, v2 -; CI-NEXT: v_or_b32_e32 v2, v17, v7 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: v_or_b32_e32 v2, v4, v5 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 +; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; CI-NEXT: s_nop 0 +; CI-NEXT: v_or_b32_e32 v1, v10, v4 +; CI-NEXT: v_or_b32_e32 v0, v8, v5 ; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_or_b32_e32 v1, v10, v6 -; CI-NEXT: v_or_b32_e32 v0, v8, v7 -; CI-NEXT: v_or_b32_e32 v3, v14, v9 -; CI-NEXT: v_or_b32_e32 v2, v12, v11 +; CI-NEXT: v_or_b32_e32 v3, v14, v6 +; CI-NEXT: v_or_b32_e32 v2, v12, v7 ; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm @@ -3018,29 +3022,31 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-NEXT: s_add_u32 s4, s2, 48 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v9, s3 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: s_add_u32 s2, s2, 16 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: v_mov_b32_e32 v12, s2 ; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v17, s3 +; VI-NEXT: v_mov_b32_e32 v16, s2 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; VI-NEXT: v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_sdwa v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; VI-NEXT: v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v18, v4 +; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v10, v10 @@ -3051,19 +3057,17 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_or_b32_e32 v1, v2, v3 -; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: v_or_b32_e32 v0, v0, v18 ; VI-NEXT: v_or_b32_e32 v3, v6, v7 -; VI-NEXT: v_or_b32_e32 v2, v18, v17 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_or_b32_e32 v2, v4, v5 ; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_or_b32_e32 v1, v10, v11 ; VI-NEXT: v_or_b32_e32 v0, v8, v9 ; VI-NEXT: v_or_b32_e32 v3, v14, v15 ; VI-NEXT: v_or_b32_e32 v2, v12, v13 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir index 78f21ef6610f2..477ca578871aa 100644 --- a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir +++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir @@ -1,6 +1,6 @@ # REQUIRES: asserts -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=0 -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -passes=machine-scheduler -amdgpu-use-amdgpu-trackers=0 -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -passes=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 420f003d4f417..f5a9f3581be74 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -29,38 +29,39 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v6, s67, 17 ; CHECK-NEXT: v_writelane_b32 v6, s68, 18 ; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: v_writelane_b32 v6, s69, 19 ; CHECK-NEXT: s_mov_b32 s68, 0 ; CHECK-NEXT: s_mov_b32 s69, s4 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx8 s[24:31], s[68:69], 0x30 -; CHECK-NEXT: s_load_dwordx16 s[52:67], s[68:69], 0xf0 -; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130 +; CHECK-NEXT: s_load_dwordx16 s[4:19], s[68:69], 0xf0 ; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v6, s70, 20 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_writelane_b32 v6, s71, 21 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_writelane_b32 v7, s4, 0 +; CHECK-NEXT: v_writelane_b32 v7, s5, 1 +; CHECK-NEXT: v_writelane_b32 v7, s6, 2 +; CHECK-NEXT: v_writelane_b32 v7, s7, 3 +; CHECK-NEXT: v_writelane_b32 v7, s8, 4 +; CHECK-NEXT: v_writelane_b32 v7, s9, 5 +; CHECK-NEXT: v_writelane_b32 v7, s10, 6 +; CHECK-NEXT: v_writelane_b32 v7, s11, 7 +; CHECK-NEXT: v_writelane_b32 v7, s12, 8 +; CHECK-NEXT: v_writelane_b32 v7, s13, 9 +; CHECK-NEXT: v_writelane_b32 v7, s14, 10 +; CHECK-NEXT: v_writelane_b32 v7, s15, 11 +; CHECK-NEXT: v_writelane_b32 v7, s16, 12 +; CHECK-NEXT: v_writelane_b32 v7, s17, 13 +; CHECK-NEXT: v_writelane_b32 v7, s18, 14 +; CHECK-NEXT: v_writelane_b32 v7, s19, 15 +; CHECK-NEXT: s_load_dwordx16 s[52:67], s[68:69], 0x130 +; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x1f0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: v_writelane_b32 v7, s8, 0 -; CHECK-NEXT: v_writelane_b32 v7, s9, 1 -; CHECK-NEXT: v_writelane_b32 v7, s10, 2 -; CHECK-NEXT: v_writelane_b32 v7, s11, 3 -; CHECK-NEXT: v_writelane_b32 v7, s12, 4 -; CHECK-NEXT: v_writelane_b32 v7, s13, 5 -; CHECK-NEXT: v_writelane_b32 v7, s14, 6 -; CHECK-NEXT: v_writelane_b32 v7, s15, 7 -; CHECK-NEXT: v_writelane_b32 v7, s16, 8 -; CHECK-NEXT: v_writelane_b32 v7, s17, 9 -; CHECK-NEXT: v_writelane_b32 v7, s18, 10 -; CHECK-NEXT: v_writelane_b32 v7, s19, 11 -; CHECK-NEXT: v_writelane_b32 v7, s20, 12 -; CHECK-NEXT: v_writelane_b32 v7, s21, 13 -; CHECK-NEXT: v_writelane_b32 v7, s22, 14 -; CHECK-NEXT: v_writelane_b32 v7, s23, 15 ; CHECK-NEXT: v_writelane_b32 v7, s52, 16 ; CHECK-NEXT: v_writelane_b32 v7, s53, 17 ; CHECK-NEXT: v_writelane_b32 v7, s54, 18 @@ -77,62 +78,43 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v7, s65, 29 ; CHECK-NEXT: v_writelane_b32 v7, s66, 30 ; CHECK-NEXT: v_writelane_b32 v7, s67, 31 -; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x1f0 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0 ; CHECK-NEXT: s_mov_b32 s69, s68 -; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 -; CHECK-NEXT: v_mov_b32_e32 v3, v2 +; CHECK-NEXT: v_readlane_b32 s60, v7, 8 +; CHECK-NEXT: v_readlane_b32 s61, v7, 9 +; CHECK-NEXT: v_readlane_b32 s62, v7, 10 +; CHECK-NEXT: v_readlane_b32 s63, v7, 11 +; CHECK-NEXT: v_readlane_b32 s64, v7, 12 +; CHECK-NEXT: v_readlane_b32 s65, v7, 13 +; CHECK-NEXT: v_readlane_b32 s66, v7, 14 +; CHECK-NEXT: v_readlane_b32 s67, v7, 15 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: image_sample_lz v4, v[2:3], s[52:59], s[68:71] dmask:0x1 +; CHECK-NEXT: v_and_b32_e32 v5, 1, v0 ; CHECK-NEXT: v_readlane_b32 s52, v7, 0 ; CHECK-NEXT: v_readlane_b32 s53, v7, 1 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s54, v7, 2 ; CHECK-NEXT: v_readlane_b32 s55, v7, 3 ; CHECK-NEXT: v_readlane_b32 s56, v7, 4 ; CHECK-NEXT: v_readlane_b32 s57, v7, 5 ; CHECK-NEXT: v_readlane_b32 s58, v7, 6 ; CHECK-NEXT: v_readlane_b32 s59, v7, 7 -; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 -; CHECK-NEXT: v_and_b32_e32 v5, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v5 -; CHECK-NEXT: v_readlane_b32 s60, v7, 8 -; CHECK-NEXT: v_readlane_b32 s61, v7, 9 -; CHECK-NEXT: image_sample_lz v4, v[2:3], s[52:59], s[68:71] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s62, v7, 10 -; CHECK-NEXT: v_readlane_b32 s63, v7, 11 -; CHECK-NEXT: v_readlane_b32 s64, v7, 12 -; CHECK-NEXT: v_readlane_b32 s65, v7, 13 -; CHECK-NEXT: v_readlane_b32 s66, v7, 14 -; CHECK-NEXT: v_readlane_b32 s67, v7, 15 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mul_f32_e32 v0, v4, v1 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: v_readlane_b32 s60, v7, 24 -; CHECK-NEXT: v_readlane_b32 s61, v7, 25 -; CHECK-NEXT: v_readlane_b32 s62, v7, 26 -; CHECK-NEXT: v_readlane_b32 s63, v7, 27 -; CHECK-NEXT: v_readlane_b32 s64, v7, 28 -; CHECK-NEXT: v_readlane_b32 s65, v7, 29 -; CHECK-NEXT: v_readlane_b32 s66, v7, 30 -; CHECK-NEXT: v_readlane_b32 s67, v7, 31 +; CHECK-NEXT: image_sample_lz v3, v[2:3], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 -; CHECK-NEXT: v_readlane_b32 s52, v7, 16 -; CHECK-NEXT: v_readlane_b32 s53, v7, 17 -; CHECK-NEXT: v_readlane_b32 s54, v7, 18 -; CHECK-NEXT: image_sample_lz v3, v[2:3], s[60:67], s[68:71] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s55, v7, 19 -; CHECK-NEXT: v_readlane_b32 s56, v7, 20 -; CHECK-NEXT: v_readlane_b32 s57, v7, 21 -; CHECK-NEXT: v_readlane_b32 s58, v7, 22 -; CHECK-NEXT: v_readlane_b32 s59, v7, 23 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: image_sample_lz v4, v[1:2], s[16:23], s[28:31] dmask:0x1 -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: image_sample_lz v1, v[1:2], s[44:51], s[68:71] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v1, v4 @@ -144,14 +126,17 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_and_saveexec_b64 s[16:17], s[4:5] ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[16:17] ; CHECK-NEXT: s_cbranch_execz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb43 ; CHECK-NEXT: s_mov_b32 s16, 0 ; CHECK-NEXT: s_mov_b32 s17, s16 -; CHECK-NEXT: v_mov_b32_e32 v0, s16 +; CHECK-NEXT: v_mov_b32_e32 v2, s16 +; CHECK-NEXT: v_mov_b32_e32 v3, s17 +; CHECK-NEXT: s_mov_b32 s18, s16 +; CHECK-NEXT: s_mov_b32 s19, s16 +; CHECK-NEXT: image_sample_lz v1, v[2:3], s[52:59], s[16:19] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s52, v7, 24 ; CHECK-NEXT: v_readlane_b32 s53, v7, 25 ; CHECK-NEXT: v_readlane_b32 s54, v7, 26 @@ -160,39 +145,20 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s57, v7, 29 ; CHECK-NEXT: v_readlane_b32 s58, v7, 30 ; CHECK-NEXT: v_readlane_b32 s59, v7, 31 -; CHECK-NEXT: v_mov_b32_e32 v1, s17 -; CHECK-NEXT: s_mov_b32 s18, s16 -; CHECK-NEXT: s_mov_b32 s19, s16 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_readlane_b32 s44, v7, 16 ; CHECK-NEXT: v_readlane_b32 s45, v7, 17 ; CHECK-NEXT: v_readlane_b32 s46, v7, 18 ; CHECK-NEXT: v_readlane_b32 s47, v7, 19 ; CHECK-NEXT: v_readlane_b32 s48, v7, 20 +; CHECK-NEXT: image_sample_lz v0, v[2:3], s[52:59], s[24:27] dmask:0x1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_readlane_b32 s49, v7, 21 ; CHECK-NEXT: v_readlane_b32 s50, v7, 22 ; CHECK-NEXT: v_readlane_b32 s51, v7, 23 -; CHECK-NEXT: v_readlane_b32 s52, v7, 8 -; CHECK-NEXT: v_readlane_b32 s53, v7, 9 -; CHECK-NEXT: v_readlane_b32 s54, v7, 10 -; CHECK-NEXT: v_readlane_b32 s55, v7, 11 -; CHECK-NEXT: v_readlane_b32 s56, v7, 12 -; CHECK-NEXT: v_readlane_b32 s57, v7, 13 -; CHECK-NEXT: v_readlane_b32 s58, v7, 14 -; CHECK-NEXT: v_readlane_b32 s59, v7, 15 -; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, v3 -; CHECK-NEXT: v_readlane_b32 s44, v7, 0 -; CHECK-NEXT: v_readlane_b32 s45, v7, 1 -; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s46, v7, 2 -; CHECK-NEXT: v_readlane_b32 s47, v7, 3 -; CHECK-NEXT: v_readlane_b32 s48, v7, 4 -; CHECK-NEXT: v_readlane_b32 s49, v7, 5 -; CHECK-NEXT: v_readlane_b32 s50, v7, 6 -; CHECK-NEXT: v_readlane_b32 s51, v7, 7 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[2:4], off, s[16:19], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[16:19], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 @@ -209,6 +175,7 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s19, s16 ; CHECK-NEXT: v_mov_b32_e32 v2, s21 ; CHECK-NEXT: image_sample_lz v3, v[1:2], s[8:15], s[16:19] dmask:0x1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[16:19] dmask:0x1 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 8fcf1ad3fbc95..0fd95a77e3d0f 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1105,13 +1105,16 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; GENERIC-LABEL: extract_neg_offset_sgpr_loaded: ; GENERIC: ; %bb.0: ; %entry ; GENERIC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 -; GENERIC-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x29 ; GENERIC-NEXT: s_load_dword s2, s[4:5], 0x39 +; GENERIC-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x29 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_or_b32 s6, s23, s51 -; GENERIC-NEXT: s_or_b32 s7, s22, s50 -; GENERIC-NEXT: s_or_b32 s21, s21, s49 -; GENERIC-NEXT: s_or_b32 s20, s20, s48 +; GENERIC-NEXT: s_addk_i32 s2, 0xfe00 +; GENERIC-NEXT: s_or_b32 s4, s23, s51 +; GENERIC-NEXT: s_or_b32 s5, s22, s50 +; GENERIC-NEXT: s_or_b32 s6, s21, s49 +; GENERIC-NEXT: s_or_b32 s7, s20, s48 ; GENERIC-NEXT: s_or_b32 s19, s19, s47 ; GENERIC-NEXT: s_or_b32 s18, s18, s46 ; GENERIC-NEXT: s_or_b32 s17, s17, s45 @@ -1124,42 +1127,38 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; GENERIC-NEXT: s_or_b32 s10, s10, s38 ; GENERIC-NEXT: s_or_b32 s8, s8, s36 ; GENERIC-NEXT: s_or_b32 s9, s9, s37 -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_mov_b32 s3, 0xf000 -; GENERIC-NEXT: s_addk_i32 s2, 0xfe00 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 1 -; GENERIC-NEXT: s_cselect_b32 s4, s9, s8 +; GENERIC-NEXT: s_cselect_b32 s8, s9, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 2 -; GENERIC-NEXT: s_cselect_b32 s4, s10, s4 +; GENERIC-NEXT: s_cselect_b32 s8, s10, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 3 -; GENERIC-NEXT: s_cselect_b32 s4, s11, s4 +; GENERIC-NEXT: s_cselect_b32 s8, s11, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 4 -; GENERIC-NEXT: s_cselect_b32 s4, s12, s4 +; GENERIC-NEXT: s_cselect_b32 s8, s12, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 5 -; GENERIC-NEXT: s_cselect_b32 s4, s13, s4 +; GENERIC-NEXT: s_cselect_b32 s8, s13, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 6 -; GENERIC-NEXT: s_cselect_b32 s4, s14, s4 +; GENERIC-NEXT: s_cselect_b32 s8, s14, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 7 -; GENERIC-NEXT: s_cselect_b32 s4, s15, s4 +; GENERIC-NEXT: s_cselect_b32 s8, s15, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 8 -; GENERIC-NEXT: s_cselect_b32 s4, s16, s4 +; GENERIC-NEXT: s_cselect_b32 s8, s16, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 9 -; GENERIC-NEXT: s_cselect_b32 s4, s17, s4 +; GENERIC-NEXT: s_cselect_b32 s8, s17, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 10 -; GENERIC-NEXT: s_cselect_b32 s4, s18, s4 +; GENERIC-NEXT: s_cselect_b32 s8, s18, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 11 -; GENERIC-NEXT: s_cselect_b32 s4, s19, s4 +; GENERIC-NEXT: s_cselect_b32 s8, s19, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 12 -; GENERIC-NEXT: s_cselect_b32 s4, s20, s4 +; GENERIC-NEXT: s_cselect_b32 s7, s7, s8 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 13 -; GENERIC-NEXT: s_cselect_b32 s4, s21, s4 +; GENERIC-NEXT: s_cselect_b32 s6, s6, s7 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 14 -; GENERIC-NEXT: s_cselect_b32 s4, s7, s4 +; GENERIC-NEXT: s_cselect_b32 s5, s5, s6 ; GENERIC-NEXT: s_cmp_eq_u32 s2, 15 -; GENERIC-NEXT: s_cselect_b32 s4, s6, s4 +; GENERIC-NEXT: s_cselect_b32 s4, s4, s5 ; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: v_mov_b32_e32 v0, s4 -; GENERIC-NEXT: s_waitcnt lgkmcnt(0) ; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GENERIC-NEXT: s_endpgm ; @@ -1273,9 +1272,9 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; SI-MOVREL-NEXT: s_or_b32 s8, s8, s36 -; SI-MOVREL-NEXT: s_or_b32 s6, s23, s51 -; SI-MOVREL-NEXT: s_or_b32 s7, s22, s50 -; SI-MOVREL-NEXT: s_or_b32 s21, s21, s49 +; SI-MOVREL-NEXT: s_or_b32 s5, s23, s51 +; SI-MOVREL-NEXT: s_or_b32 s6, s22, s50 +; SI-MOVREL-NEXT: s_or_b32 s7, s21, s49 ; SI-MOVREL-NEXT: s_or_b32 s20, s20, s48 ; SI-MOVREL-NEXT: s_or_b32 s19, s19, s47 ; SI-MOVREL-NEXT: s_or_b32 s18, s18, s46 @@ -1302,9 +1301,9 @@ define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, ; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s18 ; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s19 ; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s20 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s21 -; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s7 -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s7 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s6 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s5 ; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-MOVREL-NEXT: s_endpgm @@ -3612,8 +3611,6 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 15 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v15 -; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v14 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v13 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v12 @@ -3629,6 +3626,8 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) ; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off @@ -5653,94 +5652,94 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; GENERIC-NEXT: v_mov_b32_e32 v2, 0 ; GENERIC-NEXT: s_mov_b32 s27, s3 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: buffer_load_dword v14, v[1:2], s[24:27], 0 addr64 glc +; GENERIC-NEXT: buffer_load_dword v2, v[1:2], s[24:27], 0 addr64 glc ; GENERIC-NEXT: s_waitcnt vmcnt(0) +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_mov_b32 s2, -1 ; GENERIC-NEXT: ;;#ASMSTART ; GENERIC-NEXT: v_mov_b32 v1, 62 ; GENERIC-NEXT: ;;#ASMEND -; GENERIC-NEXT: v_mov_b32_e32 v10, s22 -; GENERIC-NEXT: v_mov_b32_e32 v11, s23 -; GENERIC-NEXT: v_mov_b32_e32 v15, s16 -; GENERIC-NEXT: v_mov_b32_e32 v2, s18 -; GENERIC-NEXT: v_mov_b32_e32 v3, s19 -; GENERIC-NEXT: v_mov_b32_e32 v4, s12 -; GENERIC-NEXT: v_mov_b32_e32 v5, s13 -; GENERIC-NEXT: v_mov_b32_e32 v6, s14 -; GENERIC-NEXT: v_mov_b32_e32 v7, s15 -; GENERIC-NEXT: v_mov_b32_e32 v8, s8 -; GENERIC-NEXT: v_mov_b32_e32 v9, s9 -; GENERIC-NEXT: v_mov_b32_e32 v12, s10 -; GENERIC-NEXT: v_mov_b32_e32 v13, s11 -; GENERIC-NEXT: v_add_i32_e32 v18, vcc, 1, v14 -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 +; GENERIC-NEXT: v_mov_b32_e32 v3, s20 +; GENERIC-NEXT: v_mov_b32_e32 v4, s21 +; GENERIC-NEXT: v_mov_b32_e32 v5, s22 +; GENERIC-NEXT: v_mov_b32_e32 v6, s23 +; GENERIC-NEXT: v_mov_b32_e32 v7, s16 +; GENERIC-NEXT: v_mov_b32_e32 v8, s17 +; GENERIC-NEXT: v_mov_b32_e32 v9, s18 +; GENERIC-NEXT: v_mov_b32_e32 v10, s19 +; GENERIC-NEXT: v_mov_b32_e32 v11, s12 +; GENERIC-NEXT: v_mov_b32_e32 v12, s13 +; GENERIC-NEXT: v_mov_b32_e32 v13, s14 +; GENERIC-NEXT: v_mov_b32_e32 v14, s15 +; GENERIC-NEXT: v_mov_b32_e32 v15, s8 +; GENERIC-NEXT: v_mov_b32_e32 v16, s9 +; GENERIC-NEXT: v_mov_b32_e32 v17, s10 +; GENERIC-NEXT: v_mov_b32_e32 v18, s11 +; GENERIC-NEXT: v_add_i32_e32 v19, vcc, 1, v2 +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc +; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 ; GENERIC-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 ; GENERIC-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc -; GENERIC-NEXT: v_mov_b32_e32 v16, s17 -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 -; GENERIC-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 -; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 -; GENERIC-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] -; GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc -; GENERIC-NEXT: v_mov_b32_e32 v19, s20 -; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 -; GENERIC-NEXT: v_mov_b32_e32 v15, s21 -; GENERIC-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v14 -; GENERIC-NEXT: v_cndmask_b32_e64 v14, v15, v1, s[0:1] -; GENERIC-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v18 -; GENERIC-NEXT: v_cndmask_b32_e64 v15, 63, v14, s[0:1] -; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GENERIC-NEXT: s_mov_b32 s2, -1 -; GENERIC-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc -; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 -; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc +; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) ; GENERIC-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; GENERIC-NEXT: s_waitcnt vmcnt(0) @@ -6192,98 +6191,97 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: buffer_load_dword v14, v[1:2], s[8:11], 0 addr64 glc +; SI-MOVREL-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-MOVREL-NEXT: ;;#ASMSTART ; SI-MOVREL-NEXT: v_mov_b32 v1, 62 ; SI-MOVREL-NEXT: ;;#ASMEND ; SI-MOVREL-NEXT: s_mov_b32 s2, -1 ; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) -; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s18 -; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s19 -; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s12 -; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s13 -; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s14 -; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s15 -; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s8 -; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s9 -; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s10 -; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s11 -; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s22 -; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s23 -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s16 -; SI-MOVREL-NEXT: v_add_i32_e32 v18, vcc, 1, v14 -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s20 +; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s21 +; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s22 +; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s23 +; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s16 +; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s17 +; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s18 +; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s19 +; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s12 +; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s13 +; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s14 +; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s15 +; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s8 +; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s9 +; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s10 +; SI-MOVREL-NEXT: v_mov_b32_e32 v18, s11 +; SI-MOVREL-NEXT: v_add_i32_e32 v19, vcc, 1, v2 +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc +; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 ; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 -; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s17 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 -; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] -; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s21 -; SI-MOVREL-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc -; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 -; SI-MOVREL-NEXT: v_cndmask_b32_e64 v14, v15, v1, s[0:1] -; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e64 v15, 63, v14, s[0:1] -; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-MOVREL-NEXT: v_mov_b32_e32 v19, s20 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc -; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 -; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc +; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 +; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc ; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 ; SI-MOVREL-NEXT: s_waitcnt vmcnt(0) ; SI-MOVREL-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 @@ -6304,104 +6302,104 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; VI-NEXT: flat_load_dword v14, v[1:2] glc +; VI-NEXT: flat_load_dword v2, v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s18 -; VI-NEXT: v_mov_b32_e32 v3, s19 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: v_mov_b32 v1, 62 ; VI-NEXT: ;;#ASMEND -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: v_mov_b32_e32 v6, s14 -; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v8, s8 -; VI-NEXT: v_mov_b32_e32 v9, s9 -; VI-NEXT: v_mov_b32_e32 v12, s10 -; VI-NEXT: v_mov_b32_e32 v13, s11 -; VI-NEXT: v_mov_b32_e32 v10, s22 -; VI-NEXT: v_mov_b32_e32 v11, s23 -; VI-NEXT: v_mov_b32_e32 v15, s16 -; VI-NEXT: v_add_u32_e32 v18, vcc, 1, v14 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 -; VI-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 -; VI-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 -; VI-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 -; VI-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 -; VI-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 -; VI-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; VI-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; VI-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 -; VI-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 -; VI-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 -; VI-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 -; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 -; VI-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; VI-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 -; VI-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 -; VI-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 -; VI-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 -; VI-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 -; VI-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 -; VI-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 -; VI-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 -; VI-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 -; VI-NEXT: v_mov_b32_e32 v16, s17 -; VI-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 -; VI-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 -; VI-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 -; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 -; VI-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 -; VI-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] -; VI-NEXT: v_mov_b32_e32 v15, s21 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v14 -; VI-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 -; VI-NEXT: v_cndmask_b32_e64 v14, v15, v1, s[0:1] -; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v18 -; VI-NEXT: v_cndmask_b32_e64 v15, 63, v14, s[0:1] -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v19, s20 -; VI-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 -; VI-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s20 +; VI-NEXT: v_mov_b32_e32 v4, s21 +; VI-NEXT: v_mov_b32_e32 v5, s22 +; VI-NEXT: v_mov_b32_e32 v6, s23 +; VI-NEXT: v_mov_b32_e32 v7, s16 +; VI-NEXT: v_mov_b32_e32 v8, s17 +; VI-NEXT: v_mov_b32_e32 v9, s18 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v11, s12 +; VI-NEXT: v_mov_b32_e32 v12, s13 +; VI-NEXT: v_mov_b32_e32 v13, s14 +; VI-NEXT: v_mov_b32_e32 v14, s15 +; VI-NEXT: v_mov_b32_e32 v15, s8 +; VI-NEXT: v_mov_b32_e32 v16, s9 +; VI-NEXT: v_mov_b32_e32 v17, s10 +; VI-NEXT: v_mov_b32_e32 v18, s11 ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 1, v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2 +; VI-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2 +; VI-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2 +; VI-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2 +; VI-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2 +; VI-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2 +; VI-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2 +; VI-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; VI-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; VI-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; VI-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; VI-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; VI-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19 +; VI-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19 +; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19 +; VI-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19 +; VI-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19 +; VI-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19 +; VI-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19 +; VI-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19 +; VI-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19 +; VI-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19 +; VI-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19 +; VI-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19 +; VI-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19 +; VI-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19 +; VI-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19 ; VI-NEXT: v_mov_b32_e32 v19, s3 ; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[18:19], v[14:17] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6433,104 +6431,103 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) ; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[30:31], s[4:5], 0x24 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: global_load_dword v14, v1, s[0:1] glc +; GFX9-IDXMODE-NEXT: global_load_dword v3, v1, s[0:1] glc ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s11 ; GFX9-IDXMODE-NEXT: ;;#ASMSTART ; GFX9-IDXMODE-NEXT: v_mov_b32 v1, 62 ; GFX9-IDXMODE-NEXT: ;;#ASMEND -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s9 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s10 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s11 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s22 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s23 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s16 -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, v2, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, v3, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v19, v4, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v9, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v14 -; GFX9-IDXMODE-NEXT: v_add_u32_e32 v18, 1, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, v12, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, v13, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s18 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s19 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s12 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s14 +; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s15 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v3 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[2:3], 14, v3 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[28:29], 3, v3 +; GFX9-IDXMODE-NEXT: v_add_u32_e32 v20, 1, v3 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v3 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[26:27], 2, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v23, v6, v1, s[2:3] +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v6, v19, v1, s[28:29] +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v20 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v22, v5, v1, s[0:1] +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v5, v18, v1, s[26:27] +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v20 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[22:23], 0, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, v17, v1, s[24:25] ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v18 +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v20 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[4:5], 15, v3 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[6:7], 8, v3 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[8:9], 9, v3 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[10:11], 10, v3 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[12:13], 11, v3 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v3 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[16:17], 5, v3 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[18:19], 6, v3 +; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[20:21], 7, v3 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, v16, v1, s[22:23] ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v18 +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v27, v10, v1, s[10:11] +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v10, v15, v1, s[20:21] ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 63, v7, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v6, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v20, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v19, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v17, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 63, v16, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v19, v10, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, v11, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v14 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s17 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v16, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 63, v11, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v18 -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v18 +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v26, v9, v1, s[8:9] +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v9, v14, v1, s[18:19] ; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v17, 63, v17, s[0:1] -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s21 -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[0:1], 13, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, 63, v19, vcc -; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v14 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v14, v15, v1, s[0:1] -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v18 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v15, 63, v14, s[0:1] -; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s20 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, v19, v1, vcc -; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v18 -; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, 0 -; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v25, v8, v1, s[6:7] +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v8, v13, v1, s[16:17] +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v24, v7, v1, s[4:5] +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v7, v12, v1, s[14:15] +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[12:13] +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v11, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v27, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 15, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v18, 63, v24, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc +; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v20 +; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc ; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] offset:48 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[15:18], s[30:31] offset:48 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[10:13], s[0:1] offset:32 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[11:14], s[30:31] offset:32 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] offset:16 +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[7:10], s[30:31] offset:16 ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) -; GFX9-IDXMODE-NEXT: global_store_dwordx4 v18, v[2:5], s[0:1] +; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[3:6], s[30:31] ; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0) ; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB17_2 @@ -6565,134 +6562,132 @@ bb2: define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) { ; GENERIC-LABEL: insert_w_offset_multiple_in_block: ; GENERIC: ; %bb.0: ; %entry -; GENERIC-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x9 -; GENERIC-NEXT: s_load_dword s24, s[4:5], 0xb -; GENERIC-NEXT: s_mov_b32 s31, 0xf000 -; GENERIC-NEXT: s_mov_b32 s30, -1 -; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41500000 -; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41880000 -; GENERIC-NEXT: v_mov_b32_e32 v2, 0x41600000 -; GENERIC-NEXT: v_mov_b32_e32 v3, 0x41700000 -; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41800000 -; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000 -; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000 -; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000 -; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000 +; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GENERIC-NEXT: s_load_dword s4, s[4:5], 0xb +; GENERIC-NEXT: s_mov_b32 s3, 0xf000 +; GENERIC-NEXT: s_mov_b32 s2, -1 +; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41500000 +; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41880000 +; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41600000 +; GENERIC-NEXT: v_mov_b32_e32 v2, 0x41700000 +; GENERIC-NEXT: v_mov_b32_e32 v3, 0x41800000 +; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41100000 +; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41200000 +; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41300000 +; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41400000 ; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000 ; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40c00000 ; GENERIC-NEXT: v_mov_b32_e32 v11, 0x40e00000 ; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41000000 ; GENERIC-NEXT: v_mov_b32_e32 v15, 0x40400000 ; GENERIC-NEXT: s_waitcnt lgkmcnt(0) -; GENERIC-NEXT: s_add_i32 s25, s24, 1 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 12 +; GENERIC-NEXT: s_add_i32 s5, s4, 1 +; GENERIC-NEXT: s_cmp_eq_u32 s5, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 14 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 9 +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 10 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 11 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 4 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 5 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 6 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 7 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, v12, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 0 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v13, 1.0, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 1 +; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, 2.0, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 2 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v15, v15, v8, vcc +; GENERIC-NEXT: s_cmp_eq_u32 s5, 3 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v16, 4.0, v8, vcc +; GENERIC-NEXT: s_add_i32 s4, s4, 2 +; GENERIC-NEXT: s_cmp_lg_u32 s4, 3 +; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 ; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 13 -; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 14 -; GENERIC-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 15 -; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 8 -; GENERIC-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 9 -; GENERIC-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 10 -; GENERIC-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 11 -; GENERIC-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 4 -; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 5 -; GENERIC-NEXT: s_cselect_b64 s[16:17], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 6 -; GENERIC-NEXT: s_cselect_b64 s[18:19], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 7 -; GENERIC-NEXT: s_cselect_b64 s[20:21], -1, 0 -; GENERIC-NEXT: s_cmp_eq_u32 s25, 0 -; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, v0, s[22:23] -; GENERIC-NEXT: s_cmp_eq_u32 s25, 1 -; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v14, 2.0, v0, s[22:23] -; GENERIC-NEXT: s_cmp_eq_u32 s25, 2 -; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[22:23] -; GENERIC-NEXT: s_cmp_eq_u32 s25, 3 -; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v16, 4.0, v0, s[22:23] -; GENERIC-NEXT: s_add_i32 s26, s24, 2 -; GENERIC-NEXT: s_cmp_lg_u32 s26, 3 -; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[28:31], 0 -; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: s_cmp_lg_u32 s26, 2 -; GENERIC-NEXT: s_cselect_b64 s[24:25], -1, 0 -; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[22:23] -; GENERIC-NEXT: v_cndmask_b32_e64 v15, v0, v15, s[24:25] -; GENERIC-NEXT: s_cmp_lg_u32 s26, 1 -; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v14, v0, v14, s[22:23] -; GENERIC-NEXT: s_cmp_lg_u32 s26, 0 -; GENERIC-NEXT: s_cselect_b64 s[22:23], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v13, v0, v13, s[22:23] -; GENERIC-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[14:15] -; GENERIC-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[16:17] -; GENERIC-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[18:19] -; GENERIC-NEXT: v_cndmask_b32_e64 v12, v12, v0, s[20:21] -; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:16 -; GENERIC-NEXT: s_cmp_lg_u32 s26, 7 -; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GENERIC-NEXT: s_cmp_lg_u32 s26, 6 -; GENERIC-NEXT: s_cselect_b64 s[16:17], -1, 0 ; GENERIC-NEXT: s_waitcnt expcnt(0) -; GENERIC-NEXT: v_cndmask_b32_e64 v12, v0, v12, s[14:15] -; GENERIC-NEXT: v_cndmask_b32_e64 v11, v0, v11, s[16:17] -; GENERIC-NEXT: s_cmp_lg_u32 s26, 5 -; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[14:15] -; GENERIC-NEXT: s_cmp_lg_u32 s26, 4 -; GENERIC-NEXT: s_cselect_b64 s[14:15], -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e64 v9, v0, v9, s[14:15] -; GENERIC-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GENERIC-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[0:1] -; GENERIC-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[2:3] -; GENERIC-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5] -; GENERIC-NEXT: v_cndmask_b32_e64 v5, v5, v0, s[6:7] -; GENERIC-NEXT: buffer_store_dwordx4 v[1:4], off, s[28:31], 0 offset:48 -; GENERIC-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9] -; GENERIC-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[10:11] -; GENERIC-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[12:13] -; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:32 -; GENERIC-NEXT: s_cmp_lg_u32 s26, 11 -; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[28:31], 0 offset:80 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: s_waitcnt expcnt(1) -; GENERIC-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 10 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v7, v0, v7, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 9 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 8 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 15 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 14 -; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[28:31], 0 offset:96 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 13 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GENERIC-NEXT: s_cmp_lg_u32 s26, 12 -; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 -; GENERIC-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GENERIC-NEXT: buffer_store_dwordx4 v[1:4], off, s[28:31], 0 offset:112 -; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[28:31], 0 offset:64 +; GENERIC-NEXT: v_cndmask_b32_e32 v16, v8, v16, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 2 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v15, v8, v15, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 1 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v14, v8, v14, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 0 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v13, v8, v13, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 7 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 6 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v11, v8, v11, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 5 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 4 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 11 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 10 +; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 9 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 8 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 15 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 14 +; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 13 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; GENERIC-NEXT: s_cmp_lg_u32 s4, 12 +; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0 +; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 ; GENERIC-NEXT: s_endpgm ; ; NOOPT-LABEL: insert_w_offset_multiple_in_block: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 4a89b2fcc017c..8c9862e86a565 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -860,16 +860,14 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0x124 ; GCN-NEXT: v_mov_b32_e32 v24, s0 -; GCN-NEXT: s_load_dword s0, s[4:5], 0x124 ; GCN-NEXT: v_mov_b32_e32 v25, s1 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NEXT: v_mov_b32_e32 v4, s12 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 m0, s0, 1 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NEXT: v_mov_b32_e32 v6, s14 ; GCN-NEXT: v_mov_b32_e32 v7, s15 @@ -885,6 +883,8 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v17, s25 ; GCN-NEXT: v_mov_b32_e32 v18, s26 ; GCN-NEXT: v_mov_b32_e32 v19, s27 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 m0, s8, 1 ; GCN-NEXT: v_mov_b32_e32 v20, s28 ; GCN-NEXT: v_mov_b32_e32 v21, s29 ; GCN-NEXT: v_mov_b32_e32 v22, s30 @@ -894,7 +894,6 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> ; GCN-NEXT: v_mov_b32_e32 v28, s6 ; GCN-NEXT: v_mov_b32_e32 v29, s7 ; GCN-NEXT: v_movreld_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-NEXT: v_movreld_b32_e32 v1, v32 ; GCN-NEXT: s_addc_u32 s3, s1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index b81fdd36530da..e8bdd4cb467ad 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -1597,108 +1597,108 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; SI-LABEL: v_insertelement_v16bf16_dynamic: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 -; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x4 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; SI-NEXT: s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[0:1], s[14:15] +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v5, 0 -; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:16 -; SI-NEXT: s_cmp_eq_u32 s7, 6 -; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 offset:16 +; SI-NEXT: s_cmp_eq_u32 s5, 6 +; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 7 -; SI-NEXT: s_mov_b64 s[14:15], s[2:3] +; SI-NEXT: s_cmp_eq_u32 s5, 7 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cndmask_b32_e32 v11, v10, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 4 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 5 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; SI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 2 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 3 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 4 ; SI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; SI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 5 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; SI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 2 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[0:1] -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; SI-NEXT: v_cndmask_b32_e64 v12, v13, v6, s[2:3] -; SI-NEXT: s_cmp_eq_u32 s7, 1 +; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 0 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_cndmask_b32_e32 v11, v13, v6, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s5, 1 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 14 -; SI-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[4:5] -; SI-NEXT: v_or_b32_e32 v8, v8, v12 -; SI-NEXT: v_cndmask_b32_e32 v12, v14, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s5, 14 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_cndmask_b32_e32 v11, v14, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 15 +; SI-NEXT: s_cmp_eq_u32 s5, 15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 12 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: v_cndmask_b32_e32 v12, v15, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s5, 12 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_cndmask_b32_e32 v11, v15, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 13 +; SI-NEXT: s_cmp_eq_u32 s5, 13 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 10 -; SI-NEXT: v_or_b32_e32 v3, v3, v12 -; SI-NEXT: v_cndmask_b32_e32 v12, v16, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s5, 10 +; SI-NEXT: v_or_b32_e32 v3, v3, v11 +; SI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 11 +; SI-NEXT: s_cmp_eq_u32 s5, 11 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v2, v2, v12 -; SI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc +; SI-NEXT: s_cmp_eq_u32 s5, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_cndmask_b32_e32 v11, v17, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s7, 9 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; SI-NEXT: s_cmp_eq_u32 s5, 9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v1, v1, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 ; SI-NEXT: v_or_b32_e32 v0, v0, v6 -; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], v[4:5], s[12:15], 0 addr64 +; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[7:10], v[4:5], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v16bf16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 @@ -1715,80 +1715,80 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cmp_eq_u32 s7, 14 +; VI-NEXT: s_cmp_eq_u32 s5, 14 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc -; VI-NEXT: v_mov_b32_e32 v12, s6 +; VI-NEXT: v_mov_b32_e32 v12, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 15 +; VI-NEXT: s_cmp_eq_u32 s5, 15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 12 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 13 +; VI-NEXT: s_cmp_eq_u32 s5, 12 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 13 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 10 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 11 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 10 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 11 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 8 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] -; VI-NEXT: s_cmp_eq_u32 s7, 9 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 8 +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 9 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 6 -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s5, 6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v14, v16, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 7 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] +; VI-NEXT: s_cmp_eq_u32 s5, 7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 4 -; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s5, 4 +; VI-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v14, v17, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 5 -; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; VI-NEXT: s_cmp_eq_u32 s5, 5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 2 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s5, 2 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v14, v18, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 3 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: s_cmp_eq_u32 s5, 3 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s5, 0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v14, v19, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 1 -; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_cmp_eq_u32 s5, 1 +; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -1858,17 +1858,17 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; GFX900-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[18:19] ; GFX900-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX900-NEXT: s_mov_b64 vcc, s[24:25] -; GFX900-NEXT: v_perm_b32 v3, v3, v10, s30 -; GFX900-NEXT: v_cndmask_b32_e64 v10, v5, v9, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v16, v5, v9, s[22:23] ; GFX900-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX900-NEXT: s_mov_b64 vcc, s[28:29] -; GFX900-NEXT: v_perm_b32 v2, v2, v11, s30 -; GFX900-NEXT: v_cndmask_b32_e64 v11, v4, v9, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v17, v4, v9, s[26:27] ; GFX900-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX900-NEXT: v_perm_b32 v7, v7, v14, s30 ; GFX900-NEXT: v_perm_b32 v6, v6, v15, s30 -; GFX900-NEXT: v_perm_b32 v5, v5, v10, s30 -; GFX900-NEXT: v_perm_b32 v4, v4, v11, s30 +; GFX900-NEXT: v_perm_b32 v5, v5, v16, s30 +; GFX900-NEXT: v_perm_b32 v4, v4, v17, s30 +; GFX900-NEXT: v_perm_b32 v3, v3, v10, s30 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s30 ; GFX900-NEXT: v_perm_b32 v1, v1, v12, s30 ; GFX900-NEXT: v_perm_b32 v0, v0, v13, s30 ; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[36:37] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 92ea83fdfb982..45268a1e00eef 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -3305,17 +3305,17 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX9-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[18:19] ; GFX9-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_mov_b64 vcc, s[24:25] -; GFX9-NEXT: v_perm_b32 v3, v3, v10, s30 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v5, v9, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v16, v5, v9, s[22:23] ; GFX9-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_mov_b64 vcc, s[28:29] -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s30 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v4, v9, s[26:27] +; GFX9-NEXT: v_cndmask_b32_e64 v17, v4, v9, s[26:27] ; GFX9-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v7, v14, s30 ; GFX9-NEXT: v_perm_b32 v6, v6, v15, s30 -; GFX9-NEXT: v_perm_b32 v5, v5, v10, s30 -; GFX9-NEXT: v_perm_b32 v4, v4, v11, s30 +; GFX9-NEXT: v_perm_b32 v5, v5, v16, s30 +; GFX9-NEXT: v_perm_b32 v4, v4, v17, s30 +; GFX9-NEXT: v_perm_b32 v3, v3, v10, s30 +; GFX9-NEXT: v_perm_b32 v2, v2, v11, s30 ; GFX9-NEXT: v_perm_b32 v1, v1, v12, s30 ; GFX9-NEXT: v_perm_b32 v0, v0, v13, s30 ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[36:37] offset:16 @@ -3325,7 +3325,7 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; VI-LABEL: v_insertelement_v16f16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 @@ -3342,80 +3342,80 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_cmp_eq_u32 s7, 14 +; VI-NEXT: s_cmp_eq_u32 s5, 14 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc -; VI-NEXT: v_mov_b32_e32 v12, s6 +; VI-NEXT: v_mov_b32_e32 v12, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 15 +; VI-NEXT: s_cmp_eq_u32 s5, 15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 12 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 13 +; VI-NEXT: s_cmp_eq_u32 s5, 12 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 13 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 10 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 11 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 10 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 11 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 8 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] -; VI-NEXT: s_cmp_eq_u32 s7, 9 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 8 +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 9 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 6 -; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s5, 6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v14, v16, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 7 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] +; VI-NEXT: s_cmp_eq_u32 s5, 7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 4 -; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s5, 4 +; VI-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v14, v17, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 5 -; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; VI-NEXT: s_cmp_eq_u32 s5, 5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 2 -; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s5, 2 +; VI-NEXT: v_or_b32_sdwa v7, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v14, v18, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 3 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: s_cmp_eq_u32 s5, 3 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc +; VI-NEXT: s_cmp_eq_u32 s5, 0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_or_b32_sdwa v6, v6, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v14, v19, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cmp_eq_u32 s7, 1 -; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_cmp_eq_u32 s5, 1 +; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -3452,101 +3452,101 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 ; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1] ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 11 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc -; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3] ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 10 -; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; CI-NEXT: v_or_b32_e32 v9, v9, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; CI-NEXT: v_or_b32_e32 v8, v8, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 ; CI-NEXT: s_cmp_eq_u32 s5, 9 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 8 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v16 -; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 7 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 6 -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cndmask_b32_e32 v15, v15, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 5 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 4 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc +; CI-NEXT: v_or_b32_e32 v10, v10, v11 +; CI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3] ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_or_b32_e32 v2, v2, v11 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_or_b32_e32 v10, v10, v11 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; CI-NEXT: v_or_b32_e32 v7, v7, v12 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; CI-NEXT: s_cmp_eq_u32 s5, 3 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_or_b32_e32 v3, v3, v12 +; CI-NEXT: v_or_b32_e32 v9, v9, v12 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_or_b32_e32 v2, v2, v12 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; CI-NEXT: s_cmp_eq_u32 s5, 3 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc +; CI-NEXT: v_or_b32_e32 v7, v7, v12 +; CI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 1 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; CI-NEXT: v_or_b32_e32 v1, v1, v6 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; CI-NEXT: v_or_b32_e32 v8, v8, v13 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; CI-NEXT: v_or_b32_e32 v1, v1, v6 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; CI-NEXT: v_or_b32_e32 v3, v3, v13 ; CI-NEXT: v_or_b32_e32 v0, v0, v6 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 5e2cec504c6a9..3a168b2d4e3ef 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -6398,46 +6398,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] -; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX7-GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v17, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v7, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[8:9] +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v15 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v3, vcc, v12, v16, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v19, vcc, v2, v17 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v6, v[10:11] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, v10, v13, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v6, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v20, vcc, v13, v18, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v1, v5, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v19, v7, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v3, v4, v[14:15] +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v20, v6, v[16:17] +; GFX7-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v2, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v13, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v9, v[5:6] +; GFX7-GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v11, v[7:8] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v14, v[0:1] ; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] -; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v16 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v12, vcc, v11, v17, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] -; GFX7-GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v1 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] -; GFX7-GISEL-NEXT: v_add_i32_e64 v16, s[4:5], 1, v8 -; GFX7-GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v13, s[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v2, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v17, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v15, v[7:8] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v16, v[12:13] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6497,46 +6497,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] -; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v15, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v17, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v7, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v15 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, v12, v16, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v19, vcc, v2, v17 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v6, v[10:11] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, v10, v13, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v6, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v20, vcc, v13, v18, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v1, v5, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v19, v7, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v3, v4, v[14:15] +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v20, v6, v[16:17] +; GFX8-GISEL-NEXT: v_add_u32_e32 v2, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v2, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v13, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v9, v[5:6] +; GFX8-GISEL-NEXT: v_add_u32_e32 v15, vcc, 1, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v11, v[7:8] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v14, v[0:1] ; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] -; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, v1, v16 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v12, vcc, v11, v17, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] -; GFX8-GISEL-NEXT: v_add_u32_e32 v9, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v1 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v15, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] -; GFX8-GISEL-NEXT: v_add_u32_e64 v16, s[4:5], 1, v8 -; GFX8-GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v13, s[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v2, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v17, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v15, v[7:8] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v16, v[12:13] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6588,46 +6588,46 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v12 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v4, v[8:9] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v4, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v17, v6, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v5, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v7, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[8:9] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v15 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v12, v16, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v19, vcc, v2, v17 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v6, v[10:11] ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v10, v13, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v5, v[9:10] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v6, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v20, vcc, v13, v18, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v1, v5, v[9:10] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v19, v7, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v3, v4, v[14:15] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v12, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v20, v6, v[16:17] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v14, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v2, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v13, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v9, v[5:6] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v8 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v11, v[7:8] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v18, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v10 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v14, v[0:1] ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v4, v[11:12] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v7, v[2:3] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v9, vcc, v1, v16 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v6, v[3:4] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, v11, v17, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[3:4] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v9, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v1 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v6, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v11, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v3, v[5:6] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v9, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, v[6:7] -; GFX900-GISEL-NEXT: v_add_co_u32_e64 v16, s[4:5], 1, v8 -; GFX900-GISEL-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v13, s[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v9, v[2:3] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v10, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v15, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v14, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v6, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v17, v[3:4] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v16, v[9:10] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v15, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v2, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v6, v17, v[3:4] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v15, v[7:8] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v16, v[12:13] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir index eaf669da83ead..cd1a0f394d4da 100644 --- a/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir @@ -11,26 +11,28 @@ body: | ; REG_ALLOC-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11 ; REG_ALLOC-NEXT: {{ $}} - ; REG_ALLOC-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; REG_ALLOC-NEXT: renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) ; REG_ALLOC-NEXT: renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr4, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; REG_ALLOC-NEXT: KILL killed renamable $vgpr4 ; REG_ALLOC-NEXT: KILL killed renamable $vgpr2 ; REG_ALLOC-NEXT: KILL killed renamable $vgpr0 ; REG_ALLOC-NEXT: KILL killed renamable $vgpr3 - ; REG_ALLOC-NEXT: renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec - ; REG_ALLOC-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr4, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; REG_ALLOC-NEXT: renamable $sgpr13 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + ; REG_ALLOC-NEXT: KILL killed renamable $sgpr8_sgpr9_sgpr10_sgpr11 + ; REG_ALLOC-NEXT: renamable $sgpr8 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + ; REG_ALLOC-NEXT: renamable $sgpr9 = V_READFIRSTLANE_B32 killed $vgpr16, implicit $exec ; REG_ALLOC-NEXT: renamable $sgpr6_sgpr7 = V_CMP_NE_U32_e64 killed $vgpr1, 0, implicit $exec - ; REG_ALLOC-NEXT: S_CMP_EQ_U64 killed renamable $sgpr12_sgpr13, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; REG_ALLOC-NEXT: S_CMP_EQ_U64 killed renamable $sgpr8_sgpr9, killed renamable $sgpr2_sgpr3, implicit-def $scc ; REG_ALLOC-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; REG_ALLOC-NEXT: renamable $vgpr8 = IMPLICIT_DEF + ; REG_ALLOC-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; REG_ALLOC-NEXT: $exec = S_MOV_B64_term renamable $sgpr6_sgpr7 ; REG_ALLOC-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; REG_ALLOC-NEXT: S_BRANCH %bb.2 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.1: ; REG_ALLOC-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000300 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr6_sgpr7, implicit-def $exec, implicit-def $scc, implicit $exec ; REG_ALLOC-NEXT: $exec = S_XOR_B64_term $exec, renamable $sgpr2_sgpr3, implicit-def $scc @@ -42,33 +44,33 @@ body: | ; REG_ALLOC-NEXT: liveins: $sgpr0, $sgpr1, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr1 = S_OR_B32 killed renamable $sgpr1, 2, implicit-def dead $scc - ; REG_ALLOC-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 + ; REG_ALLOC-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 ; REG_ALLOC-NEXT: renamable $vgpr11_vgpr12 = IMPLICIT_DEF - ; REG_ALLOC-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; REG_ALLOC-NEXT: renamable $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 = IMPLICIT_DEF ; REG_ALLOC-NEXT: S_BRANCH %bb.1 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.3: ; REG_ALLOC-NEXT: successors: %bb.5(0x80000000) - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000300 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec - ; REG_ALLOC-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr4, implicit $exec + ; REG_ALLOC-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec ; REG_ALLOC-NEXT: S_CMP_EQ_U32 killed renamable $sgpr6, killed renamable $sgpr1, implicit-def $scc ; REG_ALLOC-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; REG_ALLOC-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 + ; REG_ALLOC-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 ; REG_ALLOC-NEXT: S_BRANCH %bb.5 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.4: - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (<4 x s32>), addrspace 4) - ; REG_ALLOC-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec - ; REG_ALLOC-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr8, killed renamable $vgpr0, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; REG_ALLOC-NEXT: renamable $vgpr1 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec + ; REG_ALLOC-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) ; REG_ALLOC-NEXT: S_ENDPGM 0 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: bb.5: ; REG_ALLOC-NEXT: successors: %bb.4(0x80000000) - ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; REG_ALLOC-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 ; REG_ALLOC-NEXT: {{ $}} ; REG_ALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc ; REG_ALLOC-NEXT: S_BRANCH %bb.4 @@ -78,26 +80,28 @@ body: | ; DEAD_INST_DEL-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11 ; DEAD_INST_DEL-NEXT: {{ $}} - ; DEAD_INST_DEL-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; DEAD_INST_DEL-NEXT: renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr15_vgpr16_vgpr17_vgpr18 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr3, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr2, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) ; DEAD_INST_DEL-NEXT: renamable $vgpr11_vgpr12_vgpr13_vgpr14 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr4, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr4 ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr2 ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr0 ; DEAD_INST_DEL-NEXT: KILL killed renamable $vgpr3 - ; DEAD_INST_DEL-NEXT: renamable $sgpr12 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec - ; DEAD_INST_DEL-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr4, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; DEAD_INST_DEL-NEXT: renamable $sgpr13 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + ; DEAD_INST_DEL-NEXT: KILL killed renamable $sgpr8_sgpr9_sgpr10_sgpr11 + ; DEAD_INST_DEL-NEXT: renamable $sgpr8 = V_READFIRSTLANE_B32 killed $vgpr15, implicit $exec + ; DEAD_INST_DEL-NEXT: renamable $sgpr9 = V_READFIRSTLANE_B32 killed $vgpr16, implicit $exec ; DEAD_INST_DEL-NEXT: renamable $sgpr6_sgpr7 = V_CMP_NE_U32_e64 killed $vgpr1, 0, implicit $exec - ; DEAD_INST_DEL-NEXT: S_CMP_EQ_U64 killed renamable $sgpr12_sgpr13, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; DEAD_INST_DEL-NEXT: S_CMP_EQ_U64 killed renamable $sgpr8_sgpr9, killed renamable $sgpr2_sgpr3, implicit-def $scc ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; DEAD_INST_DEL-NEXT: renamable $vgpr8 = IMPLICIT_DEF + ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; DEAD_INST_DEL-NEXT: $exec = S_MOV_B64_term renamable $sgpr6_sgpr7 ; DEAD_INST_DEL-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.2 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.1: ; DEAD_INST_DEL-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000300 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr6_sgpr7, implicit-def $exec, implicit-def $scc, implicit $exec ; DEAD_INST_DEL-NEXT: $exec = S_XOR_B64_term $exec, renamable $sgpr2_sgpr3, implicit-def $scc @@ -109,33 +113,33 @@ body: | ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $sgpr1, $vgpr10, $sgpr4_sgpr5, $sgpr6_sgpr7 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = S_OR_B32 killed renamable $sgpr1, 2, implicit-def dead $scc - ; DEAD_INST_DEL-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 + ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 ; DEAD_INST_DEL-NEXT: renamable $vgpr11_vgpr12 = IMPLICIT_DEF - ; DEAD_INST_DEL-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; DEAD_INST_DEL-NEXT: renamable $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 = IMPLICIT_DEF ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.1 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.3: ; DEAD_INST_DEL-NEXT: successors: %bb.5(0x80000000) - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7:0x0000000000000300 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr11_vgpr12_vgpr13_vgpr14:0x0000000000000003, $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8:0x0000000000000300 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = V_READFIRSTLANE_B32 killed $vgpr11, implicit $exec - ; DEAD_INST_DEL-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr4, implicit $exec + ; DEAD_INST_DEL-NEXT: renamable $sgpr6 = V_READFIRSTLANE_B32 killed $vgpr5, implicit $exec ; DEAD_INST_DEL-NEXT: S_CMP_EQ_U32 killed renamable $sgpr6, killed renamable $sgpr1, implicit-def $scc ; DEAD_INST_DEL-NEXT: renamable $sgpr1 = S_CSELECT_B32 1, 0, implicit $scc - ; DEAD_INST_DEL-NEXT: renamable $vgpr8 = COPY killed renamable $sgpr1 + ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr1 ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.5 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.4: - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr4_sgpr5 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr4_sgpr5 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (<4 x s32>), addrspace 4) - ; DEAD_INST_DEL-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec - ; DEAD_INST_DEL-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr8, killed renamable $vgpr0, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; DEAD_INST_DEL-NEXT: renamable $vgpr1 = V_ADD_U32_e64 killed $sgpr0, killed $vgpr10, 0, implicit $exec + ; DEAD_INST_DEL-NEXT: BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) ; DEAD_INST_DEL-NEXT: S_ENDPGM 0 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: bb.5: ; DEAD_INST_DEL-NEXT: successors: %bb.4(0x80000000) - ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr8, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; DEAD_INST_DEL-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10, $sgpr2_sgpr3, $sgpr4_sgpr5 ; DEAD_INST_DEL-NEXT: {{ $}} ; DEAD_INST_DEL-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc ; DEAD_INST_DEL-NEXT: S_BRANCH %bb.4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index 689d1472d6010..bc3c973179df3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -6,143 +6,142 @@ define amdgpu_kernel void @largeInterleave() #0 { ret void } ; GCN-LABEL: largeInterleave: ; GCN: ; %bb.0: - ; GCN-NEXT: ; implicit-def: $vgpr16 - ; GCN-NEXT: ; implicit-def: $vgpr25 - ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) - ; GCN-NEXT: v_readfirstlane_b32 s17, v16 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: ; implicit-def: $vgpr17 - ; GCN-NEXT: ; implicit-def: $sgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr3 + ; GCN-NEXT: ; implicit-def: $vgpr1 + ; GCN-NEXT: ; implicit-def: $vgpr2 + ; GCN-NEXT: ; implicit-def: $vgpr187 + ; GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155 + ; GCN-NEXT: ; implicit-def: $vgpr74 + ; GCN-NEXT: ; implicit-def: $vgpr75 + ; GCN-NEXT: ; implicit-def: $vgpr76 + ; GCN-NEXT: ; implicit-def: $vgpr77 + ; GCN-NEXT: ; implicit-def: $vgpr72 + ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; GCN-NEXT: v_add_u32_e32 v182, v72, v77 + ; GCN-NEXT: v_readfirstlane_b32 s7, v0 + ; GCN-NEXT: ; implicit-def: $sgpr5 ; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: s_lshl_b32 s18, s17, 7 - ; GCN-NEXT: ; implicit-def: $vgpr18 - ; GCN-NEXT: v_add_lshl_u32 v230, v18, s18, 1 - ; GCN-NEXT: v_lshl_add_u32 v25, s17, 4, v25 - ; GCN-NEXT: v_mul_lo_u32 v25, v25, s6 - ; GCN-NEXT: v_add_lshl_u32 v226, v25, v17, 1 - ; GCN-NEXT: v_add_u32_e32 v17, s15, v226 - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v183, v72, v74 + ; GCN-NEXT: v_add_u32_e32 v184, v72, v75 + ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v3 + ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 + ; GCN-NEXT: v_add_lshl_u32 v168, v0, v1, 1 + ; GCN-NEXT: v_add_u32_e32 v0, s5, v168 + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v168, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v17, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v0, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, 64, v17 - ; GCN-NEXT: ; implicit-def: $vgpr213 - ; GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155 - ; GCN-NEXT: ; implicit-def: $vgpr246 - ; GCN-NEXT: v_add_u32_e32 v188, 0x80, v17 + ; GCN-NEXT: s_lshl_b32 s5, s7, 7 + ; GCN-NEXT: v_add_lshl_u32 v228, v2, s5, 1 + ; GCN-NEXT: v_add_u32_e32 v73, 64, v0 + ; GCN-NEXT: v_add_u32_e32 v169, 0x80, v0 + ; GCN-NEXT: v_add_u32_e32 v224, 0xc0, v0 + ; GCN-NEXT: ; implicit-def: $vgpr0 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: ; implicit-def: $vgpr1 + ; GCN-NEXT: v_add_u32_e32 v181, v72, v76 + ; GCN-NEXT: v_add_u32_e32 v0, s7, v0 + ; GCN-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 + ; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 + ; GCN-NEXT: v_add_lshl_u32 v175, v1, v0, 1 + ; GCN-NEXT: ; implicit-def: $vgpr0 + ; GCN-NEXT: v_lshl_add_u32 v173, v0, 1, v175 + ; GCN-NEXT: ; implicit-def: $vgpr0 + ; GCN-NEXT: v_lshl_add_u32 v174, v0, 1, v173 + ; GCN-NEXT: ; implicit-def: $vgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr74 + ; GCN-NEXT: ; implicit-def: $vgpr75 + ; GCN-NEXT: ; implicit-def: $vgpr76 + ; GCN-NEXT: ; implicit-def: $vgpr77 + ; GCN-NEXT: ; implicit-def: $vgpr4 + ; GCN-NEXT: ; implicit-def: $vgpr5 + ; GCN-NEXT: ; implicit-def: $vgpr6 + ; GCN-NEXT: ; implicit-def: $vgpr7 + ; GCN-NEXT: v_add_u32_e32 v225, v72, v4 + ; GCN-NEXT: v_add_u32_e32 v226, v72, v5 + ; GCN-NEXT: v_add_u32_e32 v227, v72, v6 + ; GCN-NEXT: v_add_u32_e32 v229, v72, v7 + ; GCN-NEXT: v_lshl_add_u32 v176, v0, 1, v174 + ; GCN-NEXT: ; implicit-def: $vgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr1 + ; GCN-NEXT: ; implicit-def: $vgpr2 + ; GCN-NEXT: ; implicit-def: $vgpr3 + ; GCN-NEXT: v_add_u32_e32 v243, v72, v3 + ; GCN-NEXT: v_add_u32_e32 v244, v72, v0 + ; GCN-NEXT: v_add_u32_e32 v245, v72, v1 + ; GCN-NEXT: v_add_u32_e32 v246, v72, v2 + ; GCN-NEXT: v_add_u32_e32 v178, v72, v77 + ; GCN-NEXT: v_add_u32_e32 v179, v72, v74 + ; GCN-NEXT: v_add_u32_e32 v180, v72, v75 + ; GCN-NEXT: v_add_u32_e32 v177, v72, v76 + ; GCN-NEXT: ; implicit-def: $vgpr242 ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159 + ; GCN-NEXT: ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151 ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147 - ; GCN-NEXT: ; implicit-def: $vgpr19 - ; GCN-NEXT: ; implicit-def: $vgpr26 - ; GCN-NEXT: ; implicit-def: $vgpr27 - ; GCN-NEXT: v_add_u32_e32 v227, 0xc0, v17 - ; GCN-NEXT: v_add_u32_e32 v231, v19, v26 - ; GCN-NEXT: v_add_u32_e32 v232, v19, v27 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; implicit-def: $vgpr28 - ; GCN-NEXT: ; implicit-def: $vgpr29 - ; GCN-NEXT: v_add_u32_e32 v233, v19, v28 - ; GCN-NEXT: v_add_u32_e32 v234, v19, v29 - ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143 + ; GCN-NEXT: ; implicit-def: $sgpr12 ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: ; implicit-def: $sgpr7 - ; GCN-NEXT: ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151 ; GCN-NEXT: ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139 ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135 - ; GCN-NEXT: ; implicit-def: $vgpr20 - ; GCN-NEXT: v_add_u32_e32 v18, s17, v20 - ; GCN-NEXT: v_and_b32_e32 v18, 0x1fffffff, v18 - ; GCN-NEXT: ; implicit-def: $sgpr16 - ; GCN-NEXT: v_mul_lo_u32 v18, v18, s16 - ; GCN-NEXT: ; implicit-def: $vgpr21 - ; GCN-NEXT: v_add_lshl_u32 v199, v21, v18, 1 - ; GCN-NEXT: ; implicit-def: $vgpr22 - ; GCN-NEXT: v_lshl_add_u32 v200, v22, 1, v199 - ; GCN-NEXT: ; implicit-def: $vgpr23 - ; GCN-NEXT: v_lshl_add_u32 v201, v23, 1, v200 - ; GCN-NEXT: ; implicit-def: $vgpr24 - ; GCN-NEXT: v_lshl_add_u32 v202, v24, 1, v201 - ; GCN-NEXT: ; implicit-def: $vgpr16 - ; GCN-NEXT: ; implicit-def: $vgpr18 - ; GCN-NEXT: ; implicit-def: $vgpr20 - ; GCN-NEXT: ; implicit-def: $vgpr24 - ; GCN-NEXT: v_add_u32_e32 v247, v19, v24 - ; GCN-NEXT: v_add_u32_e32 v248, v19, v16 - ; GCN-NEXT: v_add_u32_e32 v249, v19, v18 - ; GCN-NEXT: v_add_u32_e32 v250, v19, v20 + ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143 ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131 - ; GCN-NEXT: ; implicit-def: $sgpr14 - ; GCN-NEXT: ; implicit-def: $vgpr196 - ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13 - ; GCN-NEXT: ; implicit-def: $vgpr211 - ; GCN-NEXT: v_max_f32_e32 v212, v211, v211 - ; GCN-NEXT: ; implicit-def: $vgpr198 - ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN-NEXT: ; implicit-def: $vgpr32 - ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: ; implicit-def: $vgpr34 - ; GCN-NEXT: v_add_u32_e32 v210, v19, v34 - ; GCN-NEXT: v_add_u32_e32 v206, v19, v33 - ; GCN-NEXT: v_add_u32_e32 v205, v19, v32 + ; GCN-NEXT: ; implicit-def: $sgpr13 + ; GCN-NEXT: ; implicit-def: $vgpr170 + ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 + ; GCN-NEXT: ; implicit-def: $vgpr185 + ; GCN-NEXT: v_max_f32_e32 v186, v185, v185 + ; GCN-NEXT: ; implicit-def: $vgpr172 ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 - ; GCN-NEXT: ; implicit-def: $vgpr21 - ; GCN-NEXT: ; implicit-def: $vgpr22 - ; GCN-NEXT: ; implicit-def: $vgpr23 - ; GCN-NEXT: ; implicit-def: $vgpr30 - ; GCN-NEXT: ; implicit-def: $vgpr31 - ; GCN-NEXT: v_add_u32_e32 v207, v19, v21 - ; GCN-NEXT: v_add_u32_e32 v208, v19, v22 - ; GCN-NEXT: v_add_u32_e32 v209, v19, v23 - ; GCN-NEXT: v_add_u32_e32 v203, v19, v30 - ; GCN-NEXT: v_add_u32_e32 v204, v19, v31 - ; GCN-NEXT: ; kill: killed $vgpr17 ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - ; GCN-NEXT: ; implicit-def: $vgpr197 + ; GCN-NEXT: ; implicit-def: $vgpr171 ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v230, v[64:67] + ; GCN-NEXT: ds_write_b128 v228, v[64:67] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v230, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v228, v[68:71] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v168, s[8:11], 0 offen offset:64 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v73, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[64:67], v213 + ; GCN-NEXT: ds_read_b128 v[64:67], v187 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127] - ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:512 + ; GCN-NEXT: ds_read_b128 v[64:67], v187 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111] - ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:1024 + ; GCN-NEXT: ds_read_b128 v[64:67], v187 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[168:171], v213 offset:1536 + ; GCN-NEXT: ds_read_b128 v[188:191], v187 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[172:175], v246 + ; GCN-NEXT: ds_read_b128 v[192:195], v242 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:512 + ; GCN-NEXT: ds_read_b128 v[196:199], v242 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[180:183], v246 offset:1024 + ; GCN-NEXT: ds_read_b128 v[200:203], v242 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1536 + ; GCN-NEXT: ds_read_b128 v[204:207], v242 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART @@ -150,316 +149,309 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v230, v[160:163] + ; GCN-NEXT: ds_write_b128 v228, v[160:163] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v230, v[164:167] offset:1024 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79] + ; GCN-NEXT: ds_write_b128 v228, v[164:167] offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[154:155], v[64:79] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:128 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v168, s[8:11], 0 offen offset:128 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v188, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v169, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[188:191], v213 + ; GCN-NEXT: ds_read_b128 v[164:167], v187 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[192:195], v213 offset:512 + ; GCN-NEXT: ds_read_b128 v[188:191], v187 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[164:167], v213 offset:1024 + ; GCN-NEXT: ds_read_b128 v[208:211], v187 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[214:217], v213 offset:1536 + ; GCN-NEXT: ds_read_b128 v[212:215], v187 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127] - ; GCN-NEXT: ds_read_b128 v[218:221], v246 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[192:193], v[156:157], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[194:195], v[158:159], v[112:127] + ; GCN-NEXT: ds_read_b128 v[192:195], v242 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[222:225], v246 offset:512 + ; GCN-NEXT: ds_read_b128 v[216:219], v242 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[168:171], v246 offset:1024 + ; GCN-NEXT: ds_read_b128 v[220:223], v242 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[188:189], v[144:145], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[190:191], v[146:147], v[112:127] - ; GCN-NEXT: ds_read_b128 v[188:191], v246 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[164:165], v[148:149], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[166:167], v[150:151], v[112:127] + ; GCN-NEXT: ds_read_b128 v[164:167], v242 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v230, v[152:155] + ; GCN-NEXT: ds_write_b128 v228, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v230, v[160:163] offset:1024 + ; GCN-NEXT: ds_write_b128 v228, v[160:163] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v168, s[8:11], 0 offen offset:192 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79] - ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v224, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[192:193], v[144:145], v[112:127] + ; GCN-NEXT: buffer_load_dwordx2 v[168:169], v225, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[224:225], v226, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_perm_b32 v236, v224, v168, s12 + ; GCN-NEXT: buffer_load_dwordx2 v[226:227], v227, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[200:201], v[156:157], v[80:95] + ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v229, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79] - ; GCN-NEXT: v_perm_b32 v238, v162, v160, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127] - ; GCN-NEXT: v_perm_b32 v240, v162, v160, s7 - ; GCN-NEXT: v_perm_b32 v242, v163, v161, s5 - ; GCN-NEXT: v_perm_b32 v244, v163, v161, s7 - ; GCN-NEXT: ds_read_b128 v[160:163], v213 + ; GCN-NEXT: v_perm_b32 v238, v224, v168, s5 + ; GCN-NEXT: v_perm_b32 v240, v225, v169, s12 + ; GCN-NEXT: v_perm_b32 v168, v225, v169, s5 + ; GCN-NEXT: v_perm_b32 v237, v192, v226, s12 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[202:203], v[158:159], v[80:95] + ; GCN-NEXT: v_perm_b32 v239, v192, v226, s5 + ; GCN-NEXT: v_perm_b32 v241, v193, v227, s12 + ; GCN-NEXT: v_perm_b32 v169, v193, v227, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[196:197], v[156:157], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[208:209], v[148:149], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[198:199], v[158:159], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[210:211], v[150:151], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[204:205], v[156:157], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[188:189], v[148:149], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[220:221], v[144:145], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[206:207], v[158:159], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[190:191], v[150:151], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[194:195], v[146:147], v[112:127] + ; GCN-NEXT: ds_read_b128 v[192:195], v187 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_perm_b32 v239, v174, v172, s5 - ; GCN-NEXT: v_perm_b32 v241, v174, v172, s7 - ; GCN-NEXT: v_perm_b32 v243, v175, v173, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79] - ; GCN-NEXT: v_perm_b32 v245, v175, v173, s7 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127] - ; GCN-NEXT: ds_read_b128 v[218:221], v213 offset:512 + ; GCN-NEXT: ds_read_b128 v[200:203], v187 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[172:175], v213 offset:1024 + ; GCN-NEXT: ds_read_b128 v[208:211], v187 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[146:147], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[178:179], v[158:159], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[148:149], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[140:141], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[192:193], v[144:145], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[150:151], v[112:127] - ; GCN-NEXT: ds_read_b128 v[160:163], v213 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[222:223], v[146:147], v[80:95] + ; GCN-NEXT: ds_read_b128 v[220:223], v187 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[184:187], v246 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[212:213], v[148:149], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[144:145], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[208:209], v[136:137], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[192:193], v[136:137], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[150:151], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[146:147], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[210:211], v[138:139], v[80:95] + ; GCN-NEXT: ds_read_b128 v[208:211], v242 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[214:217], v246 offset:512 + ; GCN-NEXT: ds_read_b128 v[204:207], v242 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:1024 + ; GCN-NEXT: ds_read_b128 v[212:215], v242 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[142:143], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[146:147], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[148:149], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[156:157], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[140:141], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[150:151], v[64:79] - ; GCN-NEXT: ds_read_b128 v[160:163], v246 offset:1536 + ; GCN-NEXT: ds_read_b128 v[224:227], v242 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v230, v[152:155] + ; GCN-NEXT: ds_write_b128 v228, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v230, v[226:229] offset:1024 + ; GCN-NEXT: ds_write_b128 v228, v[160:163] offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[194:195], v[138:139], v[112:127] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[158:159], v[80:95] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[156:159], v213 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[226:229], v213 offset:512 + ; GCN-NEXT: ds_read_b128 v[152:155], v187 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[180:183], v213 offset:1024 + ; GCN-NEXT: ds_read_b128 v[228:231], v187 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[152:155], v213 offset:1536 + ; GCN-NEXT: ds_read_b128 v[232:235], v187 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[230:233], v246 + ; GCN-NEXT: ds_read_b128 v[192:195], v187 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[234:237], v246 offset:512 + ; GCN-NEXT: ds_read_b128 v[160:163], v242 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127] - ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[164:165], v[144:145], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[200:201], v[136:137], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[208:209], v[132:133], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[166:167], v[146:147], v[64:79] + ; GCN-NEXT: ds_read_b128 v[164:167], v242 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[224:225], v[142:143], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[148:149], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127] - ; GCN-NEXT: ds_read_b128 v[156:159], v246 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[202:203], v[138:139], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[210:211], v[134:135], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[220:221], v[136:137], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[204:205], v[132:133], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[140:141], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[212:213], v[132:133], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[206:207], v[134:135], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[222:223], v[138:139], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[142:143], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[214:215], v[134:135], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[224:225], v[132:133], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[140:141], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[128:129], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[232:233], v[140:141], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[226:227], v[134:135], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[230:231], v[142:143], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[130:131], v[112:127] + ; GCN-NEXT: ds_read_b128 v[160:163], v242 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[234:235], v[142:143], v[80:95] + ; GCN-NEXT: s_nop 6 + ; GCN-NEXT: v_mul_f32_e32 v152, s4, v112 + ; GCN-NEXT: v_mul_f32_e32 v153, s4, v113 + ; GCN-NEXT: v_max3_f32 v152, v152, s13, v153 + ; GCN-NEXT: v_mul_f32_e32 v153, s4, v114 + ; GCN-NEXT: v_mul_f32_e32 v154, s4, v115 + ; GCN-NEXT: v_max3_f32 v152, v152, v153, v154 + ; GCN-NEXT: v_mul_f32_e32 v153, s4, v116 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[192:193], v[140:141], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v154, s4, v117 + ; GCN-NEXT: v_max3_f32 v187, v152, v153, v154 + ; GCN-NEXT: ds_read_b128 v[152:155], v242 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v208, s4, v118 + ; GCN-NEXT: v_mul_f32_e32 v209, s4, v119 + ; GCN-NEXT: v_max3_f32 v187, v187, v208, v209 + ; GCN-NEXT: v_mul_f32_e32 v208, s4, v120 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[164:165], v[128:129], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v209, s4, v121 + ; GCN-NEXT: v_max3_f32 v187, v187, v208, v209 + ; GCN-NEXT: v_mul_f32_e32 v208, s4, v122 + ; GCN-NEXT: v_mul_f32_e32 v209, s4, v123 + ; GCN-NEXT: v_max3_f32 v187, v187, v208, v209 + ; GCN-NEXT: v_mul_f32_e32 v208, s4, v124 + ; GCN-NEXT: v_mul_f32_e32 v156, s4, v125 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[194:195], v[142:143], v[64:79] + ; GCN-NEXT: v_max3_f32 v156, v187, v208, v156 + ; GCN-NEXT: v_mul_f32_e32 v157, s4, v126 + ; GCN-NEXT: v_mul_f32_e32 v187, s4, v127 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v199, v[238:239] + ; GCN-NEXT: ds_write_b64 v175, v[236:237] + ; GCN-NEXT: v_max3_f32 v187, v156, v157, v187 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v200, v[240:241] + ; GCN-NEXT: ds_write_b64 v173, v[238:239] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[160:161], v[128:129], v[80:95] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v201, v[242:243] + ; GCN-NEXT: ds_write_b64 v174, v[240:241] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v202, v[244:245] + ; GCN-NEXT: ds_write_b64 v176, v[168:169] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v243, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[150:151], v[96:111] - ; GCN-NEXT: buffer_load_dwordx2 v[194:195], v248, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[196:197], v244, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[218:219], v249, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[198:199], v245, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[220:221], v250, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[208:209], v246, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_perm_b32 v188, v194, v192, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[164:165], v[144:145], v[80:95] - ; GCN-NEXT: v_perm_b32 v189, v220, v218, s5 - ; GCN-NEXT: v_perm_b32 v191, v220, v218, s7 - ; GCN-NEXT: v_perm_b32 v190, v194, v192, s7 - ; GCN-NEXT: v_perm_b32 v192, v195, v193, s5 - ; GCN-NEXT: v_perm_b32 v194, v195, v193, s7 - ; GCN-NEXT: v_perm_b32 v193, v221, v219, s5 - ; GCN-NEXT: v_perm_b32 v195, v221, v219, s7 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[136:137], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[150:151], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[138:139], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[176:177], v[136:137], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[226:227], v[132:133], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[178:179], v[138:139], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[136:137], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[128:129], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[134:135], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[132:133], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[138:139], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[232:233], v[130:131], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[234:235], v[128:129], v[96:111] - ; GCN-NEXT: s_nop 9 - ; GCN-NEXT: v_mul_f32_e32 v213, s4, v112 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v113 - ; GCN-NEXT: v_max3_f32 v213, v213, s14, v218 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v114 - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v115 - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v116 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[134:135], v[80:95] - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v117 - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v118 - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v119 - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v120 - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v121 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79] - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v122 - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v123 - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v124 - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v125 - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v126 - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v127 - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95] - ; GCN-NEXT: s_nop 6 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v96 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v97 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v98 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v99 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v100 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v101 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v102 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v103 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v104 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v105 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95] - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v106 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v107 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v108 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v109 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v110 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v111 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v140, s4, v80 - ; GCN-NEXT: v_mul_f32_e32 v141, s4, v81 - ; GCN-NEXT: v_max3_f32 v140, v213, v140, v141 - ; GCN-NEXT: v_mul_f32_e32 v141, s4, v82 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v142, s4, v83 - ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 - ; GCN-NEXT: v_mul_f32_e32 v141, s4, v84 - ; GCN-NEXT: v_mul_f32_e32 v142, s4, v85 - ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 - ; GCN-NEXT: v_mul_f32_e32 v141, s4, v86 - ; GCN-NEXT: v_mul_f32_e32 v142, s4, v87 - ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 - ; GCN-NEXT: v_mul_f32_e32 v141, s4, v88 - ; GCN-NEXT: v_mul_f32_e32 v142, s4, v89 - ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 - ; GCN-NEXT: v_mul_f32_e32 v141, s4, v90 - ; GCN-NEXT: v_mul_f32_e32 v142, s4, v91 - ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 - ; GCN-NEXT: v_mul_f32_e32 v141, s4, v92 - ; GCN-NEXT: v_mul_f32_e32 v142, s4, v93 - ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 - ; GCN-NEXT: v_mul_f32_e32 v141, s4, v94 - ; GCN-NEXT: v_mul_f32_e32 v142, s4, v95 - ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 - ; GCN-NEXT: v_mul_f32_e32 v128, s4, v64 - ; GCN-NEXT: v_mul_f32_e32 v129, s4, v65 - ; GCN-NEXT: v_max3_f32 v128, v140, v128, v129 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[166:167], v[130:131], v[96:111] + ; GCN-NEXT: v_perm_b32 v168, v196, v158, s12 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[128:129], v[64:79] + ; GCN-NEXT: s_nop 8 + ; GCN-NEXT: v_mul_f32_e32 v128, s4, v96 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v97 + ; GCN-NEXT: v_max3_f32 v128, v187, v128, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v98 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v99 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v100 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[162:163], v[130:131], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v101 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v102 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v103 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v104 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v105 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v106 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v107 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v108 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v109 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v110 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v111 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v80 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v81 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[130:131], v[64:79] + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v82 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v83 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v84 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v85 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v86 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v87 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v88 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v89 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v90 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v91 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v92 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v93 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v94 + ; GCN-NEXT: v_mul_f32_e32 v132, s4, v95 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v132 + ; GCN-NEXT: v_mul_f32_e32 v129, s4, v64 + ; GCN-NEXT: v_mul_f32_e32 v130, s4, v65 + ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 ; GCN-NEXT: v_mul_f32_e32 v129, s4, v66 ; GCN-NEXT: v_mul_f32_e32 v130, s4, v67 ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 @@ -481,84 +473,72 @@ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v78 ; GCN-NEXT: v_mul_f32_e32 v130, s4, v79 ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 - ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128 + ; GCN-NEXT: ds_bpermute_b32 v129, v170, v128 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: ds_read_b128 v[130:133], v172 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576 + ; GCN-NEXT: ds_read_b128 v[134:137], v172 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_perm_b32 v169, v208, v198, s12 + ; GCN-NEXT: v_perm_b32 v148, v196, v158, s5 ; GCN-NEXT: v_max_f32_e32 v129, v129, v129 ; GCN-NEXT: v_max_f32_e32 v128, v128, v129 - ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128 + ; GCN-NEXT: ds_bpermute_b32 v129, v170, v128 + ; GCN-NEXT: v_perm_b32 v149, v208, v198, s5 + ; GCN-NEXT: v_perm_b32 v156, v197, v159, s12 + ; GCN-NEXT: v_perm_b32 v157, v209, v199, s12 + ; GCN-NEXT: v_perm_b32 v158, v197, v159, s5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13] + ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[6:7] ; GCN-NEXT: v_max_f32_e32 v128, v128, v128 - ; GCN-NEXT: v_max_f32_e32 v128, v212, v128 - ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128 - ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_fma_f32 v113, s4, v114, -v128 - ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_fma_f32 v113, s4, v115, -v128 - ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128 - ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128 - ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128 + ; GCN-NEXT: v_max_f32_e32 v128, v186, v128 ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128 - ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128 - ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128 - ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128 ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 - ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120 - ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128 - ; GCN-NEXT: v_exp_f32_e32 v114, v138 - ; GCN-NEXT: v_exp_f32_e32 v115, v139 - ; GCN-NEXT: v_exp_f32_e32 v116, v140 - ; GCN-NEXT: v_exp_f32_e32 v117, v141 - ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v118 - ; GCN-NEXT: v_exp_f32_e32 v118, v142 - ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v120 - ; GCN-NEXT: v_exp_f32_e32 v120, v144 + ; GCN-NEXT: v_fma_f32 v129, s4, v113, -v128 + ; GCN-NEXT: v_fma_f32 v138, s4, v114, -v128 ; GCN-NEXT: v_exp_f32_e32 v113, v112 - ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114 - ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116 - ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129 - ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128 - ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119 - ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115 - ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122 - ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117 - ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128 - ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121 - ; GCN-NEXT: v_exp_f32_e32 v112, v129 - ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122 - ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128 - ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v119, v143 - ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v129 + ; GCN-NEXT: v_fma_f32 v139, s4, v115, -v128 + ; GCN-NEXT: v_exp_f32_e32 v114, v112 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v138 + ; GCN-NEXT: v_exp_f32_e32 v115, v112 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_fma_f32 v140, s4, v116, -v128 + ; GCN-NEXT: v_exp_f32_e32 v116, v112 + ; GCN-NEXT: v_fma_f32 v141, s4, v117, -v128 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v140 + ; GCN-NEXT: v_fma_f32 v142, s4, v118, -v128 + ; GCN-NEXT: v_exp_f32_e32 v117, v112 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v141 + ; GCN-NEXT: v_fma_f32 v143, s4, v119, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v129, v113 + ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v114 + ; GCN-NEXT: v_fma_f32 v150, s4, v121, -v128 + ; GCN-NEXT: v_exp_f32_e32 v118, v112 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v142 + ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v115 + ; GCN-NEXT: v_fma_f32 v151, s4, v122, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v122, v116 + ; GCN-NEXT: v_exp_f32_e32 v119, v112 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_fma_f32 v138, s4, v120, -v128 + ; GCN-NEXT: v_exp_f32_e32 v120, v112 + ; GCN-NEXT: v_sub_f32_e32 v112, v185, v128 + ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 + ; GCN-NEXT: v_exp_f32_e32 v112, v112 + ; GCN-NEXT: v_pack_b32_f16 v147, v121, v122 + ; GCN-NEXT: v_pack_b32_f16 v146, v129, v139 + ; GCN-NEXT: v_mul_f32_e32 v121, 0x3fb8aa3b, v138 + ; GCN-NEXT: ds_read_b128 v[138:141], v172 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[142:145], v172 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0] @@ -567,7 +547,7 @@ ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[130:131], v[146:147], v[32:47] ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112 ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112 ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112 @@ -577,6 +557,25 @@ ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_fma_f32 v152, s4, v123, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v129, v117 + ; GCN-NEXT: v_fma_f32 v130, s4, v124, -v128 + ; GCN-NEXT: v_exp_f32_e32 v121, v121 + ; GCN-NEXT: v_mul_f32_e32 v122, 0x3fb8aa3b, v150 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118 + ; GCN-NEXT: v_mul_f32_e64 v0, v0, v112 + ; GCN-NEXT: v_mul_f32_e64 v1, v1, v112 + ; GCN-NEXT: v_mul_f32_e64 v2, v2, v112 + ; GCN-NEXT: v_mul_f32_e64 v3, v3, v112 + ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e32 v123, 0x3fb8aa3b, v151 + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v119 ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0] @@ -585,480 +584,469 @@ ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pack_b32_f16 v134, v123, v124 - ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v119 - ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120 - ; GCN-NEXT: v_exp_f32_e32 v121, v148 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v122, v149 - ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126 - ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124 - ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125 - ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128 - ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v120 + ; GCN-NEXT: v_exp_f32_e32 v122, v122 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[138:139], v[146:147], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v123, v123 + ; GCN-NEXT: v_fma_f32 v134, s4, v126, -v128 + ; GCN-NEXT: v_pack_b32_f16 v126, v129, v124 + ; GCN-NEXT: v_mul_f32_e32 v124, 0x3fb8aa3b, v152 + ; GCN-NEXT: v_fma_f32 v129, s4, v127, -v128 + ; GCN-NEXT: v_pack_b32_f16 v127, v131, v135 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v130 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v123, v150 - ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 - ; GCN-NEXT: v_fma_f32 v143, s4, v101, -v128 - ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128 - ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128 - ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128 - ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v124, v151 - ; GCN-NEXT: ds_read_b128 v[130:133], v197 + ; GCN-NEXT: v_exp_f32_e32 v124, v124 + ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v121 + ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128 + ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125 + ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v134 + ; GCN-NEXT: v_perm_b32 v159, v209, v199, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[132:133], v[126:127], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v96, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v122 + ; GCN-NEXT: ds_read_b128 v[130:133], v171 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: ds_read_b128 v[150:153], v171 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] - ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122 - ; GCN-NEXT: v_exp_f32_e32 v96, v129 - ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 - ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136 - ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_fma_f32 v154, s4, v103, -v128 + ; GCN-NEXT: v_pack_b32_f16 v142, v138, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v123 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31] + ; GCN-NEXT: v_fma_f32 v136, s4, v97, -v128 ; GCN-NEXT: v_exp_f32_e32 v97, v125 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137 - ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128 - ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v137 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124 - ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128 - ; GCN-NEXT: v_exp_f32_e32 v98, v138 - ; GCN-NEXT: v_exp_f32_e32 v99, v127 - ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135 - ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134 - ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v136 + ; GCN-NEXT: v_fma_f32 v136, s4, v98, -v128 + ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v136 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v129 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[140:141], v[126:127], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v98, v134 + ; GCN-NEXT: v_fma_f32 v105, s4, v105, -v128 + ; GCN-NEXT: v_fma_f32 v104, s4, v104, -v128 + ; GCN-NEXT: v_fma_f32 v108, s4, v108, -v128 + ; GCN-NEXT: v_fma_f32 v80, s4, v80, -v128 + ; GCN-NEXT: v_fma_f32 v109, s4, v109, -v128 + ; GCN-NEXT: v_fma_f32 v81, s4, v81, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[126:127], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v124 + ; GCN-NEXT: v_fma_f32 v127, s4, v99, -v128 + ; GCN-NEXT: v_exp_f32_e32 v99, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_pack_b32_f16 v143, v135, v126 + ; GCN-NEXT: ds_read_b128 v[134:137], v171 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v171 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15] - ; GCN-NEXT: v_fma_f32 v131, s4, v100, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v96 - ; GCN-NEXT: v_exp_f32_e32 v100, v129 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v97 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v199, v[188:189] + ; GCN-NEXT: ds_write_b64 v175, v[168:169] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v200, v[190:191] + ; GCN-NEXT: ds_write_b64 v173, v[148:149] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v201, v[192:193] + ; GCN-NEXT: ds_write_b64 v174, v[156:157] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v202, v[194:195] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v101, v125 - ; GCN-NEXT: v_pack_b32_f16 v146, v130, v131 + ; GCN-NEXT: ds_write_b64 v176, v[158:159] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[130:131], v[142:143], v[32:47] + ; GCN-NEXT: v_fma_f32 v127, s4, v100, -v128 + ; GCN-NEXT: v_exp_f32_e32 v100, v129 + ; GCN-NEXT: v_fma_f32 v130, s4, v101, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v96 + ; GCN-NEXT: v_cvt_f16_f32_e32 v129, v97 + ; GCN-NEXT: v_mul_f32_e32 v159, 0x3fb8aa3b, v104 + ; GCN-NEXT: v_fma_f32 v84, s4, v84, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[150:151], v[142:143], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v101, v146 + ; GCN-NEXT: v_pack_b32_f16 v126, v126, v129 + ; GCN-NEXT: v_cvt_f16_f32_e32 v129, v98 + ; GCN-NEXT: v_fma_f32 v85, s4, v85, -v128 + ; GCN-NEXT: v_fma_f32 v88, s4, v88, -v128 + ; GCN-NEXT: v_fma_f32 v89, s4, v89, -v128 + ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[142:143], v[0:15] + ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128 + ; GCN-NEXT: v_exp_f32_e32 v102, v147 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v182, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31] - ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128 - ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134 - ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v183, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v102, v142 - ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v208, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[150:151], v184, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v99 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[142:143], v[48:63] + ; GCN-NEXT: buffer_load_dwordx2 v[138:139], v181, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v103, v125 + ; GCN-NEXT: v_pack_b32_f16 v127, v129, v127 + ; GCN-NEXT: v_cvt_f16_f32_e32 v129, v100 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v99 - ; GCN-NEXT: v_fma_f32 v127, s4, v103, -v128 - ; GCN-NEXT: v_exp_f32_e32 v103, v150 - ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128 - ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126 - ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v127 - ; GCN-NEXT: v_perm_b32 v152, v135, v131, s5 - ; GCN-NEXT: v_perm_b32 v154, v135, v131, s7 - ; GCN-NEXT: v_fma_f32 v135, s4, v104, -v128 - ; GCN-NEXT: v_perm_b32 v126, v134, v130, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15] - ; GCN-NEXT: v_perm_b32 v150, v134, v130, s7 - ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v100 - ; GCN-NEXT: v_exp_f32_e32 v104, v129 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 - ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v101 - ; GCN-NEXT: ds_read_b128 v[130:133], v198 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_perm_b32 v127, v144, v142, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47] - ; GCN-NEXT: v_pack_b32_f16 v148, v134, v135 - ; GCN-NEXT: v_fma_f32 v135, s4, v106, -v128 - ; GCN-NEXT: v_exp_f32_e32 v105, v125 - ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v102 - ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7 - ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5 - ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v106, v156 - ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v135 - ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v103 - ; GCN-NEXT: v_fma_f32 v136, s4, v107, -v128 - ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139 - ; GCN-NEXT: v_pack_b32_f16 v149, v134, v135 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v136 - ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v107, v138 - ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[148:149], v[0:15] - ; GCN-NEXT: v_fma_f32 v131, s4, v108, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v104 - ; GCN-NEXT: v_exp_f32_e32 v108, v129 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v105 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47] - ; GCN-NEXT: v_fma_f32 v142, s4, v109, -v128 - ; GCN-NEXT: v_exp_f32_e32 v109, v125 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v142 - ; GCN-NEXT: v_pack_b32_f16 v142, v130, v131 - ; GCN-NEXT: v_fma_f32 v131, s4, v110, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v106 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31] - ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v107 - ; GCN-NEXT: v_exp_f32_e32 v110, v156 - ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128 - ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135 - ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v111, v146 - ; GCN-NEXT: v_fma_f32 v139, s4, v80, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v108 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v80, v129 - ; GCN-NEXT: ds_read_b128 v[130:133], v197 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v130 + ; GCN-NEXT: v_cvt_f16_f32_e32 v104, v102 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[132:133], v[126:127], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v158, v135 + ; GCN-NEXT: ds_read_b128 v[130:133], v172 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 - ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v109 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47] - ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128 - ; GCN-NEXT: v_exp_f32_e32 v81, v125 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v144 - ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31] - ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v110 - ; GCN-NEXT: v_fma_f32 v137, s4, v82, -v128 - ; GCN-NEXT: v_exp_f32_e32 v82, v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v111 - ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v137 - ; GCN-NEXT: v_fma_f32 v137, s4, v83, -v128 - ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v83, v135 - ; GCN-NEXT: v_pack_b32_f16 v145, v136, v134 - ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 + ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v134 + ; GCN-NEXT: ds_read_b128 v[142:145], v172 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v154 + ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128 + ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[152:153], v[126:127], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v152, v101 + ; GCN-NEXT: v_exp_f32_e32 v125, v125 + ; GCN-NEXT: v_fma_f32 v72, s4, v72, -v128 + ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 + ; GCN-NEXT: v_pack_b32_f16 v152, v129, v152 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v105 + ; GCN-NEXT: v_fma_f32 v105, s4, v106, -v128 + ; GCN-NEXT: v_mul_f32_e32 v161, 0x3fb8aa3b, v105 + ; GCN-NEXT: v_cvt_f16_f32_e32 v105, v103 + ; GCN-NEXT: v_fma_f32 v106, s4, v107, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[136:137], v[126:127], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v160, v134 + ; GCN-NEXT: v_pack_b32_f16 v153, v104, v105 + ; GCN-NEXT: v_fma_f32 v73, s4, v73, -v128 + ; GCN-NEXT: v_perm_b32 v154, v148, v146, s12 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63] + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v106 + ; GCN-NEXT: ds_read_b128 v[104:107], v172 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_perm_b32 v156, v148, v146, s5 + ; GCN-NEXT: v_exp_f32_e32 v146, v135 + ; GCN-NEXT: ds_read_b128 v[134:137], v172 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_perm_b32 v155, v138, v150, s12 + ; GCN-NEXT: v_perm_b32 v157, v138, v150, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[130:131], v[152:153], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v158 + ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v108 + ; GCN-NEXT: v_cvt_f16_f32_e32 v108, v125 + ; GCN-NEXT: v_exp_f32_e32 v148, v159 + ; GCN-NEXT: v_perm_b32 v126, v149, v147, s12 + ; GCN-NEXT: v_perm_b32 v138, v149, v147, s5 + ; GCN-NEXT: v_pack_b32_f16 v140, v130, v108 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[142:143], v[152:153], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v129, v129 + ; GCN-NEXT: v_cvt_f16_f32_e32 v108, v146 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v109 + ; GCN-NEXT: v_fma_f32 v109, s4, v111, -v128 + ; GCN-NEXT: v_perm_b32 v127, v139, v151, s12 + ; GCN-NEXT: v_perm_b32 v139, v139, v151, s5 + ; GCN-NEXT: v_mul_f32_e32 v159, 0x3fb8aa3b, v89 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[104:105], v[152:153], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v104, v160 + ; GCN-NEXT: v_exp_f32_e32 v143, v161 + ; GCN-NEXT: v_fma_f32 v105, s4, v110, -v128 + ; GCN-NEXT: v_mul_f32_e32 v105, 0x3fb8aa3b, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[134:135], v[152:153], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v147, v141 + ; GCN-NEXT: v_pack_b32_f16 v141, v104, v108 + ; GCN-NEXT: v_cvt_f16_f32_e32 v104, v148 + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v109 + ; GCN-NEXT: ds_read_b128 v[108:111], v171 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[132:133], v[140:141], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v149, v150 + ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v129 + ; GCN-NEXT: ds_read_b128 v[130:133], v171 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v134, v104, v80 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[144:145], v[140:141], v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v81 + ; GCN-NEXT: v_fma_f32 v81, s4, v82, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v143 + ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v81, v147 + ; GCN-NEXT: v_fma_f32 v82, s4, v83, -v128 + ; GCN-NEXT: v_exp_f32_e32 v142, v142 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[106:107], v[140:141], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v145, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[140:141], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v152, v135 + ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v82 + ; GCN-NEXT: v_pack_b32_f16 v135, v80, v81 + ; GCN-NEXT: ds_read_b128 v[80:83], v171 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[104:107], v171 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v199, v[126:127] + ; GCN-NEXT: ds_write_b64 v175, v[154:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v200, v[150:151] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15] + ; GCN-NEXT: ds_write_b64 v173, v[156:157] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v201, v[152:153] + ; GCN-NEXT: ds_write_b64 v174, v[126:127] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v202, v[154:155] - ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128 - ; GCN-NEXT: v_exp_f32_e32 v84, v129 - ; GCN-NEXT: v_fma_f32 v130, s4, v85, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v85, v125 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v130 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: ds_write_b64 v176, v[138:139] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[108:109], v[134:135], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v150, v150 + ; GCN-NEXT: v_cvt_f16_f32_e32 v108, v149 + ; GCN-NEXT: v_cvt_f16_f32_e32 v109, v142 + ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v84 + ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v152 + ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v88 + ; GCN-NEXT: v_pack_b32_f16 v126, v108, v109 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[130:131], v[134:135], v[16:31] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v178, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81 - ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31] - ; GCN-NEXT: v_fma_f32 v134, s4, v86, -v128 - ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v134 - ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v203, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[136:137], v179, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v144, v144 + ; GCN-NEXT: buffer_load_dwordx2 v[138:139], v180, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[80:81], v[134:135], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v151, v151 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v145 + ; GCN-NEXT: v_fma_f32 v81, s4, v86, -v128 + ; GCN-NEXT: v_mul_f32_e32 v81, 0x3fb8aa3b, v81 + ; GCN-NEXT: v_pack_b32_f16 v127, v80, v84 + ; GCN-NEXT: v_mul_f32_e32 v80, 0x3fb8aa3b, v85 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[104:105], v[134:135], v[48:63] + ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v177, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v82 - ; GCN-NEXT: v_exp_f32_e32 v86, v156 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83 + ; GCN-NEXT: v_exp_f32_e32 v153, v140 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128 - ; GCN-NEXT: v_exp_f32_e32 v87, v157 - ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138 - ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v128 - ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15] - ; GCN-NEXT: ; implicit-def: $sgpr0 - ; GCN-NEXT: v_perm_b32 v154, v135, v131, s5 - ; GCN-NEXT: v_perm_b32 v156, v135, v131, s7 - ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v128 - ; GCN-NEXT: v_perm_b32 v150, v134, v130, s5 - ; GCN-NEXT: v_perm_b32 v152, v134, v130, s7 - ; GCN-NEXT: ds_read_b128 v[130:133], v198 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v84 - ; GCN-NEXT: v_exp_f32_e32 v88, v129 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 - ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v85 - ; GCN-NEXT: v_perm_b32 v151, v146, v142, s5 - ; GCN-NEXT: v_perm_b32 v153, v146, v142, s7 - ; GCN-NEXT: v_perm_b32 v155, v147, v143, s5 - ; GCN-NEXT: v_perm_b32 v157, v147, v143, s7 - ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v89, v125 - ; GCN-NEXT: v_pack_b32_f16 v146, v134, v135 - ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v86 - ; GCN-NEXT: v_fma_f32 v135, s4, v90, -v128 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138 - ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v90, v158 - ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v64 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v87 - ; GCN-NEXT: v_fma_f32 v127, s4, v91, -v128 - ; GCN-NEXT: v_exp_f32_e32 v91, v139 - ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 - ; GCN-NEXT: v_pack_b32_f16 v147, v134, v126 - ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] - ; GCN-NEXT: v_fma_f32 v130, s4, v92, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v88 - ; GCN-NEXT: v_exp_f32_e32 v92, v129 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v130 - ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v89 - ; GCN-NEXT: v_fma_f32 v131, s4, v93, -v128 - ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v93, v125 - ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v125, v90 - ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v126 - ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v91 - ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v94, v148 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v95, v127 - ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v92 - ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_pack_b32_f16 v131, v125, v126 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v125, v129 - ; GCN-NEXT: ds_read_b128 v[132:135], v197 + ; GCN-NEXT: v_fma_f32 v134, s4, v87, -v128 + ; GCN-NEXT: ds_read_b128 v[84:87], v172 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v150 + ; GCN-NEXT: v_mul_f32_e32 v155, 0x3fb8aa3b, v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[110:111], v[126:127], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v154, v141 + ; GCN-NEXT: ds_read_b128 v[108:111], v172 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v65 + ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: v_perm_b32 v134, v136, v130, s12 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[132:133], v[126:127], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v156, v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v151 + ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v144 + ; GCN-NEXT: v_perm_b32 v140, v136, v130, s5 + ; GCN-NEXT: v_pack_b32_f16 v132, v135, v132 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[82:83], v[126:127], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v161, v81 + ; GCN-NEXT: v_cvt_f16_f32_e32 v81, v153 + ; GCN-NEXT: v_fma_f32 v82, s4, v90, -v128 + ; GCN-NEXT: v_mul_f32_e32 v162, 0x3fb8aa3b, v82 + ; GCN-NEXT: v_fma_f32 v82, s4, v91, -v128 + ; GCN-NEXT: v_pack_b32_f16 v133, v80, v81 + ; GCN-NEXT: v_perm_b32 v135, v104, v138, s12 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[106:107], v[126:127], v[48:63] + ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v82 + ; GCN-NEXT: ds_read_b128 v[80:83], v172 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v126, v155 + ; GCN-NEXT: ds_read_b128 v[88:91], v172 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_perm_b32 v141, v104, v138, s5 + ; GCN-NEXT: v_perm_b32 v106, v137, v131, s12 + ; GCN-NEXT: v_perm_b32 v104, v137, v131, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[84:85], v[132:133], v[32:47] + ; GCN-NEXT: v_fma_f32 v85, s4, v92, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v154 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v85 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v156 + ; GCN-NEXT: v_exp_f32_e32 v130, v157 + ; GCN-NEXT: v_fma_f32 v92, s4, v93, -v128 + ; GCN-NEXT: v_mul_f32_e32 v137, 0x3fb8aa3b, v92 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[108:109], v[132:133], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v131, v159 + ; GCN-NEXT: v_pack_b32_f16 v108, v84, v85 + ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v126 + ; GCN-NEXT: v_fma_f32 v85, s4, v95, -v128 + ; GCN-NEXT: v_perm_b32 v107, v105, v139, s12 + ; GCN-NEXT: v_perm_b32 v105, v105, v139, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[80:81], v[132:133], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v161 + ; GCN-NEXT: v_exp_f32_e32 v138, v162 + ; GCN-NEXT: v_fma_f32 v81, s4, v94, -v128 + ; GCN-NEXT: v_mul_f32_e32 v81, 0x3fb8aa3b, v81 + ; GCN-NEXT: v_pack_b32_f16 v109, v80, v84 + ; GCN-NEXT: v_cvt_f16_f32_e32 v80, v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[88:89], v[132:133], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v127, v127 + ; GCN-NEXT: v_mul_f32_e32 v133, 0x3fb8aa3b, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v131 + ; GCN-NEXT: v_mul_f32_e32 v89, 0x3fb8aa3b, v85 + ; GCN-NEXT: v_pack_b32_f16 v88, v80, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[86:87], v[108:109], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v132, v136 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v138 + ; GCN-NEXT: ds_read_b128 v[84:87], v171 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[92:95], v171 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[110:111], v[108:109], v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v111, 0x3fb8aa3b, v65 ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v128 - ; GCN-NEXT: v_exp_f32_e32 v126, v142 - ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94 - ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v65 - ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v95 + ; GCN-NEXT: v_exp_f32_e32 v110, v137 + ; GCN-NEXT: v_mul_f32_e32 v137, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v127 ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v127, v143 - ; GCN-NEXT: v_pack_b32_f16 v143, v64, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v129, v138 - ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v66 - ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:1152 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[82:83], v[108:109], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v136, v81 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[90:91], v[108:109], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v108, v89 + ; GCN-NEXT: v_mul_f32_e32 v91, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_pack_b32_f16 v89, v64, v65 + ; GCN-NEXT: ds_read_b128 v[64:67], v171 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[136:139], v197 offset:1728 + ; GCN-NEXT: ds_read_b128 v[80:83], v171 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v199, v[150:151] + ; GCN-NEXT: ds_write_b64 v175, v[134:135] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[84:85], v[88:89], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v132 + ; GCN-NEXT: v_exp_f32_e32 v109, v133 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v110 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v200, v[152:153] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] - ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125 - ; GCN-NEXT: v_exp_f32_e32 v130, v158 + ; GCN-NEXT: ds_write_b64 v173, v[140:141] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v201, v[154:155] + ; GCN-NEXT: ds_write_b64 v174, v[106:107] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v202, v[156:157] + ; GCN-NEXT: ds_write_b64 v176, v[104:105] + ; GCN-NEXT: v_mul_f32_e32 v104, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[92:93], v[88:89], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v92, v111 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v108 + ; GCN-NEXT: v_pack_b32_f16 v90, v84, v85 + ; GCN-NEXT: v_mul_f32_e32 v105, 0x3fb8aa3b, v69 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126 - ; GCN-NEXT: v_exp_f32_e32 v131, v144 - ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69 ; GCN-NEXT: v_fma_f32 v69, s4, v71, -v128 - ; GCN-NEXT: v_pack_b32_f16 v140, v132, v68 - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v129 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31] - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127 - ; GCN-NEXT: v_exp_f32_e32 v132, v145 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[64:65], v[88:89], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v136 + ; GCN-NEXT: v_exp_f32_e32 v93, v137 ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128 ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 - ; GCN-NEXT: v_fma_f32 v145, s4, v73, -v128 - ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v145 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v133, v141 - ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69 - ; GCN-NEXT: v_pack_b32_f16 v141, v64, v68 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[68:71], v198 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v72, v146 - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131 - ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pack_b32_f16 v64, v64, v143 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v73, v144 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31] - ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132 - ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128 - ; GCN-NEXT: v_exp_f32_e32 v74, v65 - ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v133 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63] - ; GCN-NEXT: v_fma_f32 v138, s4, v75, -v128 - ; GCN-NEXT: v_exp_f32_e32 v75, v142 - ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v138 - ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v72 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] - ; GCN-NEXT: v_fma_f32 v68, s4, v76, -v128 - ; GCN-NEXT: v_exp_f32_e32 v76, v146 - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v73 - ; GCN-NEXT: v_fma_f32 v69, s4, v77, -v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v77, v147 - ; GCN-NEXT: v_pack_b32_f16 v134, v66, v68 - ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v74 - ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v78, v67 - ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68 - ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v75 - ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128 - ; GCN-NEXT: v_exp_f32_e32 v79, v148 - ; GCN-NEXT: v_mul_f32_e32 v128, 0x3fb8aa3b, v65 - ; GCN-NEXT: v_pack_b32_f16 v135, v66, v64 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v142, v146 - ; GCN-NEXT: ds_read_b128 v[68:71], v197 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v137, v147 - ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v138, v138 - ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] - ; GCN-NEXT: s_nop 10 - ; GCN-NEXT: v_exp_f32_e32 v52, v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137 - ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v142 - ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v138 - ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v52 - ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79 - ; GCN-NEXT: v_pack_b32_f16 v50, v51, v50 - ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136 - ; GCN-NEXT: v_pack_b32_f16 v51, v54, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[80:81], v[88:89], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v88, v91 + ; GCN-NEXT: v_pack_b32_f16 v91, v64, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v109 + ; GCN-NEXT: v_mul_f32_e32 v81, 0x3fb8aa3b, v69 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[68:71], v172 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[86:87], v[90:91], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v89, v104 + ; GCN-NEXT: v_mul_f32_e32 v104, 0x3fb8aa3b, v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v72, v92 + ; GCN-NEXT: ds_read_b128 v[84:87], v172 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v80, v64, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[94:95], v[90:91], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v94, v105 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93 + ; GCN-NEXT: v_mul_f32_e32 v95, 0x3fb8aa3b, v73 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[90:91], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v105, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v88 + ; GCN-NEXT: v_fma_f32 v66, s4, v74, -v128 + ; GCN-NEXT: v_mul_f32_e32 v106, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_fma_f32 v66, s4, v75, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[82:83], v[90:91], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v90, v81 + ; GCN-NEXT: v_mul_f32_e32 v91, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_pack_b32_f16 v81, v64, v65 + ; GCN-NEXT: ds_read_b128 v[64:67], v172 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[72:75], v172 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[80:81], v[32:47] + ; GCN-NEXT: v_fma_f32 v69, s4, v76, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v89 + ; GCN-NEXT: v_mul_f32_e32 v107, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v94 + ; GCN-NEXT: v_exp_f32_e32 v104, v104 + ; GCN-NEXT: v_fma_f32 v76, s4, v77, -v128 + ; GCN-NEXT: v_pack_b32_f16 v82, v68, v69 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[80:81], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v90 + ; GCN-NEXT: v_fma_f32 v72, s4, v79, -v128 + ; GCN-NEXT: v_cvt_f16_f32_e32 v73, v104 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[64:65], v[80:81], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v105 + ; GCN-NEXT: v_fma_f32 v65, s4, v78, -v128 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_exp_f32_e32 v65, v65 + ; GCN-NEXT: v_pack_b32_f16 v83, v64, v68 + ; GCN-NEXT: v_exp_f32_e32 v64, v91 + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[82:83], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[80:81], v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v85, 0x3fb8aa3b, v76 + ; GCN-NEXT: s_nop 8 + ; GCN-NEXT: v_mul_f32_e32 v49, 0x3fb8aa3b, v72 + ; GCN-NEXT: v_exp_f32_e32 v80, v107 + ; GCN-NEXT: v_exp_f32_e32 v85, v85 + ; GCN-NEXT: v_exp_f32_e32 v52, v49 + ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v85 + ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52 + ; GCN-NEXT: v_exp_f32_e32 v84, v95 + ; GCN-NEXT: v_exp_f32_e32 v95, v106 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[82:83], v[32:47] + ; GCN-NEXT: v_pack_b32_f16 v51, v51, v50 + ; GCN-NEXT: v_pack_b32_f16 v50, v54, v53 ; GCN-NEXT: v_add_f32_e32 v53, 0, v113 ; GCN-NEXT: v_add_f32_e32 v53, v114, v53 ; GCN-NEXT: v_add_f32_e32 v53, v115, v53 @@ -1079,74 +1067,87 @@ ; GCN-NEXT: v_add_f32_e32 v53, v101, v53 ; GCN-NEXT: v_add_f32_e32 v53, v102, v53 ; GCN-NEXT: v_add_f32_e32 v53, v103, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v104, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v105, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v106, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v107, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v108, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v109, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v110, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v111, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v80, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v81, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v82, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v83, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v84, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v85, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v86, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v87, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v88, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v89, v53 - ; GCN-NEXT: v_pack_b32_f16 v49, v140, v49 - ; GCN-NEXT: v_add_f32_e32 v53, v90, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v91, v53 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15] - ; GCN-NEXT: v_add_f32_e32 v53, v92, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v93, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v94, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v95, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v158, v53 ; GCN-NEXT: v_add_f32_e32 v53, v125, v53 + ; GCN-NEXT: ds_read_b128 v[68:71], v171 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[76:79], v171 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_f32_e32 v53, v160, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[82:83], v[16:31] + ; GCN-NEXT: v_add_f32_e32 v53, v146, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v148, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v129, v53 + ; GCN-NEXT: v_cvt_f16_f32_e32 v81, v84 + ; GCN-NEXT: v_cvt_f16_f32_e32 v48, v64 + ; GCN-NEXT: v_add_f32_e32 v53, v143, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v147, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[82:83], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v95 + ; GCN-NEXT: v_add_f32_e32 v53, v149, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v142, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v145, v53 + ; GCN-NEXT: v_pack_b32_f16 v49, v66, v48 + ; GCN-NEXT: v_pack_b32_f16 v48, v73, v81 + ; GCN-NEXT: v_add_f32_e32 v53, v152, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v150, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[76:77], v[48:49], v[16:31] + ; GCN-NEXT: v_add_f32_e32 v53, v144, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v151, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v153, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v154, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v156, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v161, v53 ; GCN-NEXT: v_add_f32_e32 v53, v126, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[78:79], v[50:51], v[16:31] + ; GCN-NEXT: v_add_f32_e32 v53, v130, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v131, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v138, v53 ; GCN-NEXT: v_add_f32_e32 v53, v127, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v129, v53 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47] - ; GCN-NEXT: s_nop 9 - ; GCN-NEXT: v_add_f32_e32 v0, v130, v53 - ; GCN-NEXT: v_add_f32_e32 v0, v131, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v132, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v133, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v72, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v73, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v74, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v75, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v76, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v77, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v78, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v79, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v142, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v137, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v138, v0 - ; GCN-NEXT: v_add_f32_e32 v4, v52, v0 - ; GCN-NEXT: ds_bpermute_b32 v5, v196, v4 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31] - ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 - ; GCN-NEXT: ds_bpermute_b32 v3, v196, v2 - ; GCN-NEXT: ; implicit-def: $vgpr4 + ; GCN-NEXT: v_add_f32_e32 v53, v132, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v110, v53 + ; GCN-NEXT: v_add_f32_e32 v53, v136, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[48:49], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v53, v108, v53 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[50:51], v[32:47] + ; GCN-NEXT: s_nop 10 + ; GCN-NEXT: v_add_f32_e32 v32, v109, v53 + ; GCN-NEXT: v_add_f32_e32 v32, v92, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v93, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v88, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v89, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v94, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v105, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v90, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v104, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v84, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v95, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v64, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v80, v32 + ; GCN-NEXT: v_add_f32_e32 v32, v85, v32 + ; GCN-NEXT: v_add_f32_e32 v16, v65, v32 + ; GCN-NEXT: v_add_f32_e32 v20, v52, v16 + ; GCN-NEXT: ds_bpermute_b32 v21, v170, v20 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[16:19], v171 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[48:49], v[0:15] + ; GCN-NEXT: v_add_f32_e32 v18, v20, v21 + ; GCN-NEXT: ds_bpermute_b32 v19, v170, v18 + ; GCN-NEXT: ; implicit-def: $vgpr20 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[12:13] - ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v112 - ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1728 + ; GCN-NEXT: s_nop 7 + ; GCN-NEXT: v_cndmask_b32_e64 v0, v19, v18, s[6:7] + ; GCN-NEXT: v_fmac_f32_e32 v0, v20, v112 + ; GCN-NEXT: ds_read_b128 v[0:3], v171 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47] ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 0887fdf0844b0..372bdb0d67452 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -8,486 +8,486 @@ ; GCN: ; %bb.0: ; GCN-NEXT: ; implicit-def: $vgpr2 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) - ; GCN-NEXT: v_readfirstlane_b32 s20, v2 - ; GCN-NEXT: ; implicit-def: $sgpr4 + ; GCN-NEXT: v_readfirstlane_b32 s4, v2 ; GCN-NEXT: ; implicit-def: $vgpr3 + ; GCN-NEXT: ; implicit-def: $sgpr5 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; implicit-def: $vgpr50 - ; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19 - ; GCN-NEXT: ; implicit-def: $vgpr49 - ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 - ; GCN-NEXT: ; implicit-def: $vgpr51 - ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr76 - ; GCN-NEXT: ; implicit-def: $vgpr77 - ; GCN-NEXT: ; implicit-def: $vgpr78 - ; GCN-NEXT: ; implicit-def: $vgpr79 - ; GCN-NEXT: ; implicit-def: $vgpr80 - ; GCN-NEXT: ; implicit-def: $vgpr91 - ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 + ; GCN-NEXT: ; implicit-def: $vgpr54 + ; GCN-NEXT: ; implicit-def: $vgpr53 + ; GCN-NEXT: ; implicit-def: $vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: ; implicit-def: $vgpr55 + ; GCN-NEXT: ; implicit-def: $sgpr17 + ; GCN-NEXT: ; implicit-def: $sgpr16 + ; GCN-NEXT: ; implicit-def: $vgpr86 + ; GCN-NEXT: ; implicit-def: $vgpr87 + ; GCN-NEXT: v_max_f32_e32 v88, v87, v87 + ; GCN-NEXT: ; implicit-def: $vgpr89 + ; GCN-NEXT: ; implicit-def: $vgpr90 ; GCN-NEXT: ; iglp_opt mask(0x00000002) - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v3 - ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1] + ; GCN-NEXT: s_nop 0 + ; GCN-NEXT: v_lshl_add_u32 v2, s4, 4, v3 + ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s5, v2, v[0:1] ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: s_lshl_b32 s4, s20, 7 + ; GCN-NEXT: s_lshl_b32 s5, s4, 7 ; GCN-NEXT: ; implicit-def: $vgpr5 - ; GCN-NEXT: v_add_lshl_u32 v48, v5, s4, 1 - ; GCN-NEXT: v_add_u32_e32 v76, s20, v76 - ; GCN-NEXT: v_and_b32_e32 v76, 0x1fffffff, v76 + ; GCN-NEXT: v_add_lshl_u32 v52, v5, s5, 1 + ; GCN-NEXT: ; implicit-def: $sgpr5 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v48, v[0:3] + ; GCN-NEXT: ds_write_b128 v52, v[0:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: ; implicit-def: $vgpr1 - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_add_u32_e32 v0, v0, v50 - ; GCN-NEXT: v_add_u32_e32 v1, v1, v50 - ; GCN-NEXT: buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: v_add_u32_e32 v0, v0, v54 + ; GCN-NEXT: v_add_u32_e32 v1, v1, v54 + ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v0, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[74:75], v1, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v1, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[36:39], v49 + ; GCN-NEXT: ds_read_b128 v[36:39], v53 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[44:47], v49 offset:512 + ; GCN-NEXT: ds_read_b128 v[48:51], v53 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0 ; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: ; kill: killed $vgpr0 - ; GCN-NEXT: v_mul_lo_u32 v76, v76, s6 - ; GCN-NEXT: v_add_lshl_u32 v76, v77, v76, 1 - ; GCN-NEXT: v_lshl_add_u32 v77, v78, 1, v76 - ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: v_lshl_add_u32 v78, v79, 1, v77 - ; GCN-NEXT: ; implicit-def: $sgpr2 - ; GCN-NEXT: ; implicit-def: $sgpr3 - ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] - ; GCN-NEXT: ds_read_b128 v[36:39], v51 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[44:45], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[48:49], v[44:45], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[46:47], v[16:31] + ; GCN-NEXT: ds_read_b128 v[36:39], v55 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] - ; GCN-NEXT: ds_read_b128 v[44:47], v51 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[50:51], v[46:47], v[0:15] + ; GCN-NEXT: ds_read_b128 v[48:51], v55 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 + ; GCN-NEXT: ; implicit-def: $vgpr44_vgpr45_vgpr46_vgpr47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v48, v[32:35] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31] + ; GCN-NEXT: ds_write_b128 v52, v[32:35] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[44:45], v[16:31] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[32:35], v49 + ; GCN-NEXT: ds_read_b128 v[32:35], v53 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[48:49], v[44:45], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr48 + ; GCN-NEXT: ; implicit-def: $vgpr49 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[46:47], v[16:31] ; GCN-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] - ; GCN-NEXT: ds_read_b128 v[40:43], v49 offset:512 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v51 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[50:51], v[46:47], v[0:15] + ; GCN-NEXT: ds_read_b128 v[44:47], v53 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr50 + ; GCN-NEXT: ; implicit-def: $vgpr51 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[36:37], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr44 + ; GCN-NEXT: ; implicit-def: $vgpr45 + ; GCN-NEXT: v_add_u32_e32 v52, v45, v54 + ; GCN-NEXT: v_add_u32_e32 v44, s4, v44 + ; GCN-NEXT: v_and_b32_e32 v44, 0x1fffffff, v44 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31] - ; GCN-NEXT: ; implicit-def: $vgpr32 - ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: v_add_u32_e32 v82, v32, v50 - ; GCN-NEXT: v_add_u32_e32 v83, v33, v50 - ; GCN-NEXT: ; kill: killed $vgpr82 - ; GCN-NEXT: ; kill: killed $vgpr83 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[38:39], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr46 + ; GCN-NEXT: v_add_u32_e32 v53, v46, v54 + ; GCN-NEXT: v_mul_lo_u32 v54, v44, s5 + ; GCN-NEXT: v_add_lshl_u32 v48, v48, v54, 1 + ; GCN-NEXT: v_lshl_add_u32 v49, v49, 1, v48 + ; GCN-NEXT: v_lshl_add_u32 v84, v50, 1, v49 + ; GCN-NEXT: v_lshl_add_u32 v85, v51, 1, v84 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31] - ; GCN-NEXT: ds_read_b128 v[66:69], v51 offset:512 + ; GCN-NEXT: ds_read_b128 v[36:39], v55 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[44:47], v55 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_max_f32_e32 v81, v67, v67 - ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31] - ; GCN-NEXT: v_perm_b32 v70, v74, v72, s2 - ; GCN-NEXT: v_perm_b32 v71, v74, v72, s3 - ; GCN-NEXT: v_perm_b32 v72, v75, v73, s2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[32:33], v[16:31] + ; GCN-NEXT: v_perm_b32 v36, v42, v40, s17 + ; GCN-NEXT: v_perm_b32 v37, v42, v40, s16 + ; GCN-NEXT: v_perm_b32 v40, v43, v41, s17 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v70 + ; GCN-NEXT: ds_write_b32 v48, v36 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v71 + ; GCN-NEXT: ds_write_b32 v49, v37 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v72 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v20 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v64, s4, v16 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v17 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 - ; GCN-NEXT: v_max3_f32 v64, v64, s5, v65 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v21 - ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v22 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v23 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v24 - ; GCN-NEXT: v_mul_f32_e32 v87, s4, v25 - ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v26 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v27 - ; GCN-NEXT: v_max3_f32 v64, v64, v86, v87 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v28 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v29 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v30 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v31 - ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v0 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v80, v84 - ; GCN-NEXT: v_mul_f32_e32 v87, s4, v2 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v3 - ; GCN-NEXT: v_max3_f32 v64, v64, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v4 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v5 - ; GCN-NEXT: v_max3_f32 v64, v64, v87, v65 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v7 - ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v8 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v10 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v11 - ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v87, s4, v12 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v13 - ; GCN-NEXT: v_max3_f32 v64, v64, v86, v65 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15 - ; GCN-NEXT: v_max3_f32 v64, v64, v87, v68 - ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 - ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 - ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3 + ; GCN-NEXT: ds_write_b32 v84, v40 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[32:33], v[0:15] + ; GCN-NEXT: v_perm_b32 v32, v43, v41, s16 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v68 - ; GCN-NEXT: ; implicit-def: $vgpr84 - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v70, v64, v65 + ; GCN-NEXT: ds_write_b32 v85, v32 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[80:81], v52, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[82:83], v53, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_bpermute_b32 v71, v66, v70 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: ; implicit-def: $sgpr0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ; implicit-def: $sgpr2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[34:35], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[34:35], v[0:15] + ; GCN-NEXT: s_nop 9 + ; GCN-NEXT: v_mul_f32_e32 v32, s4, v16 + ; GCN-NEXT: v_mul_f32_e32 v33, s4, v17 + ; GCN-NEXT: v_mul_f32_e32 v34, s4, v18 + ; GCN-NEXT: v_mul_f32_e32 v35, s4, v19 + ; GCN-NEXT: v_max3_f32 v32, v32, s0, v33 + ; GCN-NEXT: v_mul_f32_e32 v36, s4, v20 + ; GCN-NEXT: v_mul_f32_e32 v37, s4, v21 + ; GCN-NEXT: v_max3_f32 v32, v32, v34, v35 + ; GCN-NEXT: v_mul_f32_e32 v38, s4, v22 + ; GCN-NEXT: v_mul_f32_e32 v39, s4, v23 + ; GCN-NEXT: v_max3_f32 v32, v32, v36, v37 + ; GCN-NEXT: v_mul_f32_e32 v40, s4, v24 + ; GCN-NEXT: v_mul_f32_e32 v41, s4, v25 + ; GCN-NEXT: v_max3_f32 v32, v32, v38, v39 + ; GCN-NEXT: v_mul_f32_e32 v42, s4, v26 + ; GCN-NEXT: v_mul_f32_e32 v43, s4, v27 + ; GCN-NEXT: v_max3_f32 v32, v32, v40, v41 + ; GCN-NEXT: v_mul_f32_e32 v44, s4, v28 + ; GCN-NEXT: v_mul_f32_e32 v45, s4, v29 + ; GCN-NEXT: v_max3_f32 v32, v32, v42, v43 + ; GCN-NEXT: v_mul_f32_e32 v46, s4, v30 + ; GCN-NEXT: v_mul_f32_e32 v47, s4, v31 + ; GCN-NEXT: v_max3_f32 v32, v32, v44, v45 + ; GCN-NEXT: v_mul_f32_e32 v50, s4, v0 + ; GCN-NEXT: v_mul_f32_e32 v51, s4, v1 + ; GCN-NEXT: v_max3_f32 v32, v32, v46, v47 + ; GCN-NEXT: v_mul_f32_e32 v52, s4, v2 + ; GCN-NEXT: v_mul_f32_e32 v53, s4, v3 + ; GCN-NEXT: v_max3_f32 v32, v32, v50, v51 + ; GCN-NEXT: v_mul_f32_e32 v54, s4, v4 + ; GCN-NEXT: v_mul_f32_e32 v55, s4, v5 + ; GCN-NEXT: v_max3_f32 v32, v32, v52, v53 + ; GCN-NEXT: v_mul_f32_e32 v56, s4, v6 + ; GCN-NEXT: v_mul_f32_e32 v57, s4, v7 + ; GCN-NEXT: v_max3_f32 v32, v32, v54, v55 + ; GCN-NEXT: v_mul_f32_e32 v58, s4, v8 + ; GCN-NEXT: v_mul_f32_e32 v59, s4, v9 + ; GCN-NEXT: v_max3_f32 v32, v32, v56, v57 + ; GCN-NEXT: v_mul_f32_e32 v60, s4, v10 + ; GCN-NEXT: v_mul_f32_e32 v61, s4, v11 + ; GCN-NEXT: v_max3_f32 v32, v32, v58, v59 + ; GCN-NEXT: v_mul_f32_e32 v62, s4, v12 + ; GCN-NEXT: v_mul_f32_e32 v63, s4, v13 + ; GCN-NEXT: v_max3_f32 v32, v32, v60, v61 + ; GCN-NEXT: v_mul_f32_e32 v64, s4, v14 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v15 + ; GCN-NEXT: v_max3_f32 v32, v32, v62, v63 + ; GCN-NEXT: v_max3_f32 v50, v32, v64, v65 + ; GCN-NEXT: ds_bpermute_b32 v51, v86, v50 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1] - ; GCN-NEXT: v_max_f32_e32 v70, v70, v70 - ; GCN-NEXT: v_max_f32_e32 v72, v81, v70 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72 - ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72 - ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72 - ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 - ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 - ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 - ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72 - ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72 - ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72 - ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72 - ; GCN-NEXT: v_exp_f32_e32 v73, v16 - ; GCN-NEXT: v_exp_f32_e32 v74, v18 - ; GCN-NEXT: v_exp_f32_e32 v75, v19 - ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 - ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 - ; GCN-NEXT: v_exp_f32_e32 v80, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73 - ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72 - ; GCN-NEXT: v_exp_f32_e32 v81, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74 - ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 - ; GCN-NEXT: v_exp_f32_e32 v82, v22 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 - ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72 - ; GCN-NEXT: v_exp_f32_e32 v83, v23 - ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72 - ; GCN-NEXT: v_exp_f32_e32 v85, v22 - ; GCN-NEXT: v_exp_f32_e32 v17, v17 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 - ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 - ; GCN-NEXT: v_exp_f32_e32 v88, v23 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 - ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 + ; GCN-NEXT: ds_read_b128 v[64:67], v89 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v16, v24 - ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576 + ; GCN-NEXT: ds_read_b128 v[68:71], v89 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v18, 0, v73 - ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 - ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80 - ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72 - ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v17, v18 - ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 - ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72 - ; GCN-NEXT: v_exp_f32_e32 v30, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 - ; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72 - ; GCN-NEXT: v_perm_b32 v31, v68, v64, s2 - ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3 - ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3 - ; GCN-NEXT: ds_read_b128 v[26:29], v91 + ; GCN-NEXT: ds_read_b128 v[72:75], v90 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v91 offset:576 + ; GCN-NEXT: ds_read_b128 v[76:79], v90 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_max_f32_e32 v51, v51, v51 + ; GCN-NEXT: v_max_f32_e32 v50, v50, v51 + ; GCN-NEXT: ds_bpermute_b32 v51, v86, v50 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v91, v51, v50, s[0:1] + ; GCN-NEXT: v_perm_b32 v50, v82, v80, s17 + ; GCN-NEXT: v_perm_b32 v51, v82, v80, s16 + ; GCN-NEXT: v_max_f32_e32 v80, v91, v91 + ; GCN-NEXT: v_max_f32_e32 v80, v88, v80 + ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v80 + ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v80 + ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v80 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 + ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v80 + ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v80 + ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v80 + ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v80 + ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v80 + ; GCN-NEXT: v_sub_f32_e32 v82, v87, v80 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 + ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 + ; GCN-NEXT: v_exp_f32_e32 v87, v16 + ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 + ; GCN-NEXT: v_exp_f32_e32 v88, v17 + ; GCN-NEXT: v_exp_f32_e32 v92, v19 + ; GCN-NEXT: v_exp_f32_e32 v91, v18 + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 + ; GCN-NEXT: v_perm_b32 v16, v83, v81, s17 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v31 - ; GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_exp_f32_e32 v31, v31 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_pack_b32_f16 v18, v19, v86 - ; GCN-NEXT: v_pack_b32_f16 v19, v22, v89 + ; GCN-NEXT: ds_write_b32 v48, v50 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v64 + ; GCN-NEXT: ds_write_b32 v49, v51 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 + ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v90 + ; GCN-NEXT: ds_write_b32 v84, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v87 + ; GCN-NEXT: v_fma_f32 v17, s4, v24, -v80 + ; GCN-NEXT: v_exp_f32_e32 v84, v20 + ; GCN-NEXT: v_perm_b32 v20, v83, v81, s16 + ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 + ; GCN-NEXT: v_mul_f32_e32 v82, 0x3fb8aa3b, v82 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v88 + ; GCN-NEXT: v_fma_f32 v19, s4, v25, -v80 + ; GCN-NEXT: v_exp_f32_e32 v81, v22 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v65 - ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v74, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85 - ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 - ; GCN-NEXT: v_exp_f32_e32 v22, v64 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88 - ; GCN-NEXT: v_exp_f32_e32 v64, v65 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v75, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 - ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 - ; GCN-NEXT: v_exp_f32_e32 v23, v23 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v31 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v20, v21 - ; GCN-NEXT: v_pack_b32_f16 v1, v18, v19 - ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72 - ; GCN-NEXT: v_exp_f32_e32 v25, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v80, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 - ; GCN-NEXT: v_fma_f32 v26, s4, v4, -v72 - ; GCN-NEXT: v_exp_f32_e32 v27, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v64 - ; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72 - ; GCN-NEXT: v_exp_f32_e32 v65, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47] + ; GCN-NEXT: ds_write_b32 v85, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v92 + ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v80 + ; GCN-NEXT: v_exp_f32_e32 v83, v23 + ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v17 + ; GCN-NEXT: v_fma_f32 v22, s4, v27, -v80 + ; GCN-NEXT: v_exp_f32_e32 v25, v21 + ; GCN-NEXT: v_exp_f32_e32 v24, v82 + ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v80 + ; GCN-NEXT: v_exp_f32_e32 v82, v23 + ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v91 + ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GCN-NEXT: v_pack_b32_f16 v16, v16, v18 + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pack_b32_f16 v17, v21, v20 + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 + ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 + ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[24:25] op_sel_hi:[1,0] + ; GCN-NEXT: v_fma_f32 v29, s4, v29, -v80 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[16:17], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v64, 0, v87 + ; GCN-NEXT: v_fma_f32 v30, s4, v30, -v80 + ; GCN-NEXT: v_exp_f32_e32 v87, v19 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v84 + ; GCN-NEXT: v_fma_f32 v31, s4, v31, -v80 + ; GCN-NEXT: v_mul_f32_e32 v27, 0x3fb8aa3b, v22 + ; GCN-NEXT: v_mul_f32_e32 v28, 0x3fb8aa3b, v28 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[16:17], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v64, v88, v64 + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v80 + ; GCN-NEXT: v_exp_f32_e32 v69, v26 + ; GCN-NEXT: v_cvt_f16_f32_e32 v85, v25 + ; GCN-NEXT: v_mul_f32_e32 v29, 0x3fb8aa3b, v29 + ; GCN-NEXT: v_mul_f32_e32 v30, 0x3fb8aa3b, v30 + ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v80 + ; GCN-NEXT: v_pack_b32_f16 v26, v65, v85 + ; GCN-NEXT: v_exp_f32_e32 v85, v27 + ; GCN-NEXT: v_add_f32_e32 v64, v91, v64 + ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v80 + ; GCN-NEXT: v_exp_f32_e32 v28, v28 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v81 + ; GCN-NEXT: v_fma_f32 v3, s4, v3, -v80 + ; GCN-NEXT: v_exp_f32_e32 v29, v29 + ; GCN-NEXT: v_exp_f32_e32 v30, v30 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v83 + ; GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v31 + ; GCN-NEXT: v_exp_f32_e32 v31, v31 ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 - ; GCN-NEXT: v_add_f32_e32 v17, v81, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23 - ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 - ; GCN-NEXT: v_exp_f32_e32 v68, v2 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25 + ; GCN-NEXT: v_pack_b32_f16 v27, v68, v65 + ; GCN-NEXT: v_mul_f32_e32 v68, 0x3fb8aa3b, v1 + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[26:27], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v82 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v87 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v84 + ; GCN-NEXT: ds_read_b128 v[16:19], v89 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pack_b32_f16 v4, v18, v4 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v19 - ; GCN-NEXT: v_exp_f32_e32 v24, v24 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576 + ; GCN-NEXT: v_pack_b32_f16 v0, v65, v66 + ; GCN-NEXT: ds_read_b128 v[20:23], v89 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v82, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v27 - ; GCN-NEXT: v_exp_f32_e32 v26, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65 - ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 - ; GCN-NEXT: v_exp_f32_e32 v67, v67 - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v83, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68 - ; GCN-NEXT: v_exp_f32_e32 v6, v6 - ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[26:27], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v26, v92, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v85 + ; GCN-NEXT: v_pack_b32_f16 v1, v27, v64 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[0:1], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v26, v84, v26 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[76:77], v[0:1], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v25, v25, v26 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v30 + ; GCN-NEXT: v_fma_f32 v0, s4, v6, -v80 + ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v31 + ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v0 + ; GCN-NEXT: v_pack_b32_f16 v1, v1, v6 + ; GCN-NEXT: v_add_f32_e32 v6, v81, v25 + ; GCN-NEXT: v_add_f32_e32 v6, v83, v6 + ; GCN-NEXT: v_add_f32_e32 v6, v82, v6 + ; GCN-NEXT: v_add_f32_e32 v6, v87, v6 + ; GCN-NEXT: v_add_f32_e32 v6, v69, v6 + ; GCN-NEXT: v_fma_f32 v4, s4, v4, -v80 + ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v4 + ; GCN-NEXT: v_fma_f32 v5, s4, v5, -v80 + ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v80 + ; GCN-NEXT: v_exp_f32_e32 v2, v2 + ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v80 + ; GCN-NEXT: v_exp_f32_e32 v66, v68 + ; GCN-NEXT: v_exp_f32_e32 v3, v3 + ; GCN-NEXT: v_exp_f32_e32 v64, v67 + ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28 + ; GCN-NEXT: v_exp_f32_e32 v4, v4 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v29 + ; GCN-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v5 ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7 + ; GCN-NEXT: v_exp_f32_e32 v5, v5 + ; GCN-NEXT: v_pack_b32_f16 v0, v27, v65 + ; GCN-NEXT: v_exp_f32_e32 v26, v26 ; GCN-NEXT: v_exp_f32_e32 v7, v7 - ; GCN-NEXT: v_pack_b32_f16 v4, v28, v29 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v69 - ; GCN-NEXT: ; implicit-def: $sgpr2 + ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v66 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[0:1], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v25, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v3 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[78:79], v[0:1], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v2 + ; GCN-NEXT: v_pack_b32_f16 v0, v25, v27 + ; GCN-NEXT: v_pack_b32_f16 v1, v1, v65 ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v0, v85, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v4, v88, v0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[16:17], v[0:1], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[0:1], v[48:63] ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 - ; GCN-NEXT: v_exp_f32_e32 v10, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v26 + ; GCN-NEXT: v_fma_f32 v10, s4, v14, -v80 + ; GCN-NEXT: v_exp_f32_e32 v14, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v4 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_pack_b32_f16 v0, v17, v28 + ; GCN-NEXT: v_pack_b32_f16 v0, v16, v17 ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v30, v4 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v0, v31, v2 - ; GCN-NEXT: v_add_f32_e32 v0, v22, v0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[0:1], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[22:23], v[0:1], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v0, v85, v6 + ; GCN-NEXT: v_add_f32_e32 v0, v28, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v29, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v30, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v31, v0 ; GCN-NEXT: v_add_f32_e32 v0, v64, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v23, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v25, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v27, v0 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 - ; GCN-NEXT: v_add_f32_e32 v0, v65, v0 - ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 + ; GCN-NEXT: v_add_f32_e32 v0, v66, v0 + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v80 + ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v80 + ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v80 + ; GCN-NEXT: v_fma_f32 v15, s4, v15, -v80 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v10 + ; GCN-NEXT: v_add_f32_e32 v0, v2, v0 ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v68, v0 - ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 - ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72 - ; GCN-NEXT: v_exp_f32_e32 v8, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v24, v0 - ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 - ; GCN-NEXT: v_exp_f32_e32 v9, v9 - ; GCN-NEXT: v_add_f32_e32 v0, v26, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v67, v0 - ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 + ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v80 ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5 - ; GCN-NEXT: v_add_f32_e32 v0, v6, v0 + ; GCN-NEXT: v_exp_f32_e32 v10, v1 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v15 + ; GCN-NEXT: v_add_f32_e32 v0, v3, v0 + ; GCN-NEXT: v_exp_f32_e32 v18, v1 ; GCN-NEXT: v_exp_f32_e32 v11, v11 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 - ; GCN-NEXT: v_exp_f32_e32 v12, v3 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 - ; GCN-NEXT: v_exp_f32_e32 v17, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14 + ; GCN-NEXT: v_exp_f32_e32 v8, v8 + ; GCN-NEXT: v_add_f32_e32 v0, v4, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v5, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v26, v0 + ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 ; GCN-NEXT: v_add_f32_e32 v0, v7, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 - ; GCN-NEXT: v_exp_f32_e32 v15, v3 - ; GCN-NEXT: v_exp_f32_e32 v18, v1 - ; GCN-NEXT: v_add_f32_e32 v6, v8, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v91 + ; GCN-NEXT: v_exp_f32_e32 v9, v9 + ; GCN-NEXT: v_add_f32_e32 v4, v8, v0 + ; GCN-NEXT: ds_read_b128 v[0:3], v90 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 - ; GCN-NEXT: v_add_f32_e32 v6, v9, v6 - ; GCN-NEXT: v_pack_b32_f16 v8, v4, v13 - ; GCN-NEXT: v_add_f32_e32 v6, v10, v6 - ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14 - ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v12 - ; GCN-NEXT: v_add_f32_e32 v6, v11, v6 - ; GCN-NEXT: v_add_f32_e32 v6, v12, v6 - ; GCN-NEXT: v_add_f32_e32 v1, v15, v6 - ; GCN-NEXT: v_add_f32_e32 v11, v17, v1 - ; GCN-NEXT: v_pack_b32_f16 v1, v0, v7 - ; GCN-NEXT: v_pack_b32_f16 v0, v4, v10 - ; GCN-NEXT: ds_read_b128 v[4:7], v91 offset:576 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v9 + ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v14 + ; GCN-NEXT: v_cvt_f16_f32_e32 v15, v11 + ; GCN-NEXT: v_add_f32_e32 v4, v9, v4 + ; GCN-NEXT: v_pack_b32_f16 v8, v16, v17 + ; GCN-NEXT: v_mul_f32_e32 v12, 0x3fb8aa3b, v12 + ; GCN-NEXT: v_pack_b32_f16 v9, v6, v15 + ; GCN-NEXT: v_exp_f32_e32 v12, v12 + ; GCN-NEXT: v_add_f32_e32 v4, v14, v4 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[8:9], v[32:47] + ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v80 + ; GCN-NEXT: v_mul_f32_e32 v13, 0x3fb8aa3b, v13 + ; GCN-NEXT: v_exp_f32_e32 v13, v13 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v18 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 + ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v12 + ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v13 + ; GCN-NEXT: v_add_f32_e32 v4, v11, v4 + ; GCN-NEXT: v_add_f32_e32 v4, v12, v4 + ; GCN-NEXT: v_add_f32_e32 v1, v13, v4 + ; GCN-NEXT: v_add_f32_e32 v10, v10, v1 + ; GCN-NEXT: v_pack_b32_f16 v1, v0, v5 + ; GCN-NEXT: v_pack_b32_f16 v0, v6, v7 + ; GCN-NEXT: ds_read_b128 v[4:7], v90 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[4:5], v[8:9], v[48:63] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_mov_b32_e32 v4, 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v18, v11 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[6:7], v[0:1], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[2:3], v[0:1], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v2, v18, v10 + ; GCN-NEXT: ds_bpermute_b32 v3, v86, v2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 + ; GCN-NEXT: ds_bpermute_b32 v3, v86, v2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] - ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16 + ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v24 ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir index a85478df10eb2..670f3f74f3730 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir @@ -57,29 +57,29 @@ body: | ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = COPY [[DEF1]] ; GCN-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 0, 0, implicit $exec :: (load (s128) from %ir.in0, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 0, 0, implicit $exec :: (load (s128) from %ir.in2, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[DS_READ_B128_gfx9_2:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF2]], 1040, 0, implicit $exec :: (load (s128) from %ir.in1, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[DS_READ_B128_gfx9_3:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 2064, 0, implicit $exec :: (load (s128) from %ir.in3, !alias.scope !0, addrspace 3) - ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = COPY [[DEF1]] ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_1]].sub0_sub1, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[DS_READ_B128_gfx9_4:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 1024, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF33]], implicit $exec ; GCN-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF21]], implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_4:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 1024, 0, implicit $exec :: (load (s128) from %ir.in4, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub2_sub3, [[DS_READ_B128_gfx9_1]].sub2_sub3, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[DS_READ_B128_gfx9_5:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 3088, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF22]], implicit $exec ; GCN-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF23]], implicit $exec + ; GCN-NEXT: [[DS_READ_B128_gfx9_5:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF3]], 3088, 0, implicit $exec :: (load (s128) from %ir.in5, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_3]].sub0_sub1, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF4]], [[DEF16]], 0, 0, implicit $exec :: (store (s128) into %ir.in6, !alias.scope !0, addrspace 3) ; GCN-NEXT: [[DEF16:%[0-9]+]]:av_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF6]], [[DEF7]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in7, !alias.scope !0, addrspace 7) ; GCN-NEXT: dead [[COPY:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_3]].sub2_sub3, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[COPY1:%[0-9]+]]:areg_512_align2 = COPY [[DEF]] ; GCN-NEXT: undef [[DEF17:%[0-9]+]].sub2:vreg_128_align2 = V_PERM_B32_e64 [[DEF13]], [[DEF12]], [[DEF30]], implicit $exec ; GCN-NEXT: [[DEF17:%[0-9]+]].sub3:vreg_128_align2 = V_PERM_B32_e64 [[DEF15]], [[DEF14]], [[DEF30]], implicit $exec ; GCN-NEXT: [[DEF17:%[0-9]+]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF30]], implicit $exec ; GCN-NEXT: [[DEF17:%[0-9]+]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF11]], [[DEF10]], [[DEF30]], implicit $exec ; GCN-NEXT: DS_WRITE_B128_gfx9 [[DEF5]], [[DEF17]], 0, 0, implicit $exec :: (store (s128) into %ir.in8, !alias.scope !0, addrspace 3) + ; GCN-NEXT: [[COPY1:%[0-9]+]]:areg_512_align2 = COPY [[DEF]] ; GCN-NEXT: [[COPY1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[DS_READ_B128_gfx9_4]].sub0_sub1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: undef [[DEF18:%[0-9]+]].sub0:vreg_128_align2 = V_PERM_B32_e64 [[DEF8]], [[DEF9]], [[DEF31]], implicit $exec ; GCN-NEXT: [[DEF18:%[0-9]+]].sub1:vreg_128_align2 = V_PERM_B32_e64 [[DEF11]], [[DEF10]], [[DEF31]], implicit $exec @@ -94,11 +94,11 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub0_sub1, [[DS_READ_B128_gfx9_5]].sub0_sub1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF24]], implicit $exec ; GCN-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF25]], implicit $exec - ; GCN-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF26]], implicit $exec - ; GCN-NEXT: [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF27]], implicit $exec ; GCN-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_4]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in14, !alias.scope !0, addrspace 7) ; GCN-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_5]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in15, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF26]], implicit $exec ; GCN-NEXT: [[DEF14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_6]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in16, !alias.scope !0, addrspace 7) + ; GCN-NEXT: [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF19]], [[DEF27]], implicit $exec ; GCN-NEXT: [[DEF15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_U32_e32_7]], [[DEF32]], 0, 0, 0, 0, implicit $exec :: (load (s128) from %ir.in17, !alias.scope !0, addrspace 7) ; GCN-NEXT: dead [[COPY1:%[0-9]+]]:areg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_e64 [[DS_READ_B128_gfx9_2]].sub2_sub3, [[DS_READ_B128_gfx9_5]].sub2_sub3, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: IGLP_OPT 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index e7d7f87e4fc4c..8a0bca97f0ffd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -5911,19 +5911,18 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v31, a7 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v30, a6 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v29, a5 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 -; NOLIT-SRCC-NEXT: s_nop 0 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a4 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] -; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_vecarg: @@ -6006,19 +6005,18 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v31, a7 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v30, a6 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v29, a5 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 -; LIT-SRCC-NEXT: s_nop 0 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 -; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 +; LIT-SRCC-NEXT: v_accvgpr_read_b32 v28, a4 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] -; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x1f32_vecarg: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll index d2712ac8e08a3..e86698828894b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll @@ -18,57 +18,31 @@ entry: } define amdgpu_ps void @tensor_load_to_lds_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) { -; GFX1250-SDAG-LABEL: tensor_load_to_lds_vector: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s10, v2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s5, v9 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s6, v10 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s7, v11 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s12, v12 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s13, v13 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s14, v14 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s15, v15 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s16, v16 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s17, v17 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s18, v18 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s19, v19 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: tensor_load_to_lds s[8:11], s[0:7], s[12:15], s[16:19] -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: tensor_load_to_lds_vector: -; GFX1250-GISEL: ; %bb.0: ; %entry -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s9, v1 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s10, v2 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s5, v9 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s6, v10 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s7, v11 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s12, v12 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s13, v13 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s14, v14 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s15, v15 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s16, v16 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s17, v17 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s18, v18 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s19, v19 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: tensor_load_to_lds s[8:11], s[0:7], s[12:15], s[16:19] -; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-LABEL: tensor_load_to_lds_vector: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s9, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s10, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-NEXT: v_readfirstlane_b32 s2, v6 +; GFX1250-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1250-NEXT: v_readfirstlane_b32 s4, v8 +; GFX1250-NEXT: v_readfirstlane_b32 s5, v9 +; GFX1250-NEXT: v_readfirstlane_b32 s6, v10 +; GFX1250-NEXT: v_readfirstlane_b32 s7, v11 +; GFX1250-NEXT: v_readfirstlane_b32 s12, v12 +; GFX1250-NEXT: v_readfirstlane_b32 s13, v13 +; GFX1250-NEXT: v_readfirstlane_b32 s14, v14 +; GFX1250-NEXT: v_readfirstlane_b32 s15, v15 +; GFX1250-NEXT: v_readfirstlane_b32 s16, v16 +; GFX1250-NEXT: v_readfirstlane_b32 s17, v17 +; GFX1250-NEXT: v_readfirstlane_b32 s18, v18 +; GFX1250-NEXT: v_readfirstlane_b32 s19, v19 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: tensor_load_to_lds s[8:11], s[0:7], s[12:15], s[16:19] +; GFX1250-NEXT: s_endpgm entry: call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, i32 0) ret void @@ -86,41 +60,23 @@ entry: } define amdgpu_ps void @tensor_load_to_lds_d2_vector(<4 x i32> %D0, <8 x i32> %D1) { -; GFX1250-SDAG-LABEL: tensor_load_to_lds_d2_vector: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s10, v2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s5, v9 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s6, v10 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s7, v11 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: tensor_load_to_lds s[8:11], s[0:7] th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: tensor_load_to_lds_d2_vector: -; GFX1250-GISEL: ; %bb.0: ; %entry -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s9, v1 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s10, v2 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s5, v9 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s6, v10 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s7, v11 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: tensor_load_to_lds s[8:11], s[0:7] th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-LABEL: tensor_load_to_lds_d2_vector: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s9, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s10, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-NEXT: v_readfirstlane_b32 s2, v6 +; GFX1250-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1250-NEXT: v_readfirstlane_b32 s4, v8 +; GFX1250-NEXT: v_readfirstlane_b32 s5, v9 +; GFX1250-NEXT: v_readfirstlane_b32 s6, v10 +; GFX1250-NEXT: v_readfirstlane_b32 s7, v11 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: tensor_load_to_lds s[8:11], s[0:7] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %D0, <8 x i32> %D1, i32 27) ret void @@ -137,57 +93,31 @@ entry: } define amdgpu_ps void @tensor_store_from_lds_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) { -; GFX1250-SDAG-LABEL: tensor_store_from_lds_vector: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s10, v2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s5, v9 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s6, v10 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s7, v11 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s12, v12 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s13, v13 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s14, v14 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s15, v15 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s16, v16 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s17, v17 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s18, v18 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s19, v19 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: tensor_store_from_lds s[8:11], s[0:7], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: tensor_store_from_lds_vector: -; GFX1250-GISEL: ; %bb.0: ; %entry -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s9, v1 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s10, v2 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s5, v9 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s6, v10 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s7, v11 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s12, v12 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s13, v13 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s14, v14 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s15, v15 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s16, v16 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s17, v17 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s18, v18 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s19, v19 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: tensor_store_from_lds s[8:11], s[0:7], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-LABEL: tensor_store_from_lds_vector: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s9, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s10, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-NEXT: v_readfirstlane_b32 s2, v6 +; GFX1250-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1250-NEXT: v_readfirstlane_b32 s4, v8 +; GFX1250-NEXT: v_readfirstlane_b32 s5, v9 +; GFX1250-NEXT: v_readfirstlane_b32 s6, v10 +; GFX1250-NEXT: v_readfirstlane_b32 s7, v11 +; GFX1250-NEXT: v_readfirstlane_b32 s12, v12 +; GFX1250-NEXT: v_readfirstlane_b32 s13, v13 +; GFX1250-NEXT: v_readfirstlane_b32 s14, v14 +; GFX1250-NEXT: v_readfirstlane_b32 s15, v15 +; GFX1250-NEXT: v_readfirstlane_b32 s16, v16 +; GFX1250-NEXT: v_readfirstlane_b32 s17, v17 +; GFX1250-NEXT: v_readfirstlane_b32 s18, v18 +; GFX1250-NEXT: v_readfirstlane_b32 s19, v19 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: tensor_store_from_lds s[8:11], s[0:7], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, i32 22) ret void @@ -204,42 +134,27 @@ entry: } define amdgpu_ps void @tensor_store_from_lds_d2_vector(<4 x i32> %D0, <8 x i32> %D1) { -; GFX1250-SDAG-LABEL: tensor_store_from_lds_d2_vector: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s10, v2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s5, v9 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s6, v10 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s7, v11 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: tensor_store_from_lds s[8:11], s[0:7] -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: tensor_store_from_lds_d2_vector: -; GFX1250-GISEL: ; %bb.0: ; %entry -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s9, v1 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s10, v2 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s11, v3 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s2, v6 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v7 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v8 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s5, v9 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s6, v10 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s7, v11 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: tensor_store_from_lds s[8:11], s[0:7] -; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-LABEL: tensor_store_from_lds_d2_vector: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s9, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s10, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s11, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-NEXT: v_readfirstlane_b32 s2, v6 +; GFX1250-NEXT: v_readfirstlane_b32 s3, v7 +; GFX1250-NEXT: v_readfirstlane_b32 s4, v8 +; GFX1250-NEXT: v_readfirstlane_b32 s5, v9 +; GFX1250-NEXT: v_readfirstlane_b32 s6, v10 +; GFX1250-NEXT: v_readfirstlane_b32 s7, v11 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: tensor_store_from_lds s[8:11], s[0:7] +; GFX1250-NEXT: s_endpgm entry: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> %D0, <8 x i32> %D1, i32 0) ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1250-GISEL: {{.*}} +; GFX1250-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 76b97e843d777..7c90a5e17094a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -569,6 +569,7 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: s_brev_b32 s6, -2 ; CI-NEXT: v_mov_b32_e32 v12, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -576,78 +577,76 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] ; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] ; CI-NEXT: v_add_f64 v[6:7], s[8:9], -v[4:5] -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 -; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: s_cselect_b32 s7, 0x3ff00000, 0 +; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[2:3]|, 0.5 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[6:7]|, 0.5 +; CI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v8, s11 -; CI-NEXT: s_and_b64 s[0:1], s[2:3], exec -; CI-NEXT: v_mov_b32_e32 v2, s7 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: s_and_b64 s[2:3], s[4:5], exec ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[14:15] ; CI-NEXT: v_bfi_b32 v13, s6, v2, v8 -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[12:13] -; CI-NEXT: v_mov_b32_e32 v8, s0 +; CI-NEXT: v_mov_b32_e32 v8, s2 ; CI-NEXT: v_mov_b32_e32 v9, s9 ; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[6:7] ; CI-NEXT: v_bfi_b32 v13, s6, v8, v9 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 +; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[0:1]|, 0.5 ; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[12:13] ; CI-NEXT: v_trunc_f64_e32 v[4:5], s[12:13] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[2:3], s[2:3], exec ; CI-NEXT: v_add_f64 v[8:9], s[12:13], -v[4:5] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v10, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[8:9]|, 0.5 +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v10, s2 +; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[8:9]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[18:19] ; CI-NEXT: v_mov_b32_e32 v11, s15 ; CI-NEXT: v_bfi_b32 v13, s6, v10, v11 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: s_and_b64 s[2:3], s[2:3], exec ; CI-NEXT: v_add_f64 v[10:11], s[18:19], -v[8:9] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[12:13] -; CI-NEXT: v_mov_b32_e32 v13, s0 +; CI-NEXT: v_mov_b32_e32 v13, s2 ; CI-NEXT: v_mov_b32_e32 v14, s13 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 +; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[10:11]|, 0.5 ; CI-NEXT: v_bfi_b32 v13, s6, v13, v14 ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[10:11], s[16:17], -v[14:15] ; CI-NEXT: v_add_f64 v[4:5], v[4:5], v[12:13] -; CI-NEXT: v_mov_b32_e32 v13, s0 +; CI-NEXT: v_mov_b32_e32 v13, s2 ; CI-NEXT: v_mov_b32_e32 v16, s19 ; CI-NEXT: v_bfi_b32 v13, s6, v13, v16 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 +; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[10:11]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[22:23] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_add_f64 v[18:19], s[22:23], -v[16:17] -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[10:11], v[8:9], v[12:13] -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: v_mov_b32_e32 v9, s17 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 +; CI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; CI-NEXT: v_add_f64 v[8:9], s[22:23], -v[16:17] +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[8:9]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v18, s17 +; CI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; CI-NEXT: v_bfi_b32 v13, s6, v13, v18 +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; CI-NEXT: v_trunc_f64_e32 v[18:19], s[20:21] -; CI-NEXT: v_bfi_b32 v13, s6, v8, v9 ; CI-NEXT: v_add_f64 v[8:9], v[14:15], v[12:13] -; CI-NEXT: v_add_f64 v[13:14], s[20:21], -v[18:19] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[13:14]|, 0.5 -; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v13, s2 ; CI-NEXT: v_mov_b32_e32 v14, s23 -; CI-NEXT: v_mov_b32_e32 v20, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; CI-NEXT: v_bfi_b32 v13, s6, v13, v14 -; CI-NEXT: v_mov_b32_e32 v21, s21 +; CI-NEXT: v_add_f64 v[14:15], s[20:21], -v[18:19] +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[14:15]|, 0.5 ; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[12:13] -; CI-NEXT: v_bfi_b32 v13, s6, v20, v21 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: v_mov_b32_e32 v16, s21 +; CI-NEXT: v_bfi_b32 v13, s6, v13, v16 ; CI-NEXT: v_add_f64 v[12:13], v[18:19], v[12:13] -; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 9fdc72f054f90..f4e3a714ae6d7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -2075,41 +2075,41 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ushort v18, v[0:1] +; GFX8-NEXT: flat_load_ushort v20, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 -; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v13, s1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v12, s0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v17, s3 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: v_mov_b32_e32 v17, s1 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: v_mov_b32_e32 v16, s2 +; GFX8-NEXT: v_mov_b32_e32 v19, s1 +; GFX8-NEXT: v_mov_b32_e32 v18, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_i32 v7, v18, 15, 1 -; GFX8-NEXT: v_bfe_i32 v6, v18, 14, 1 -; GFX8-NEXT: v_bfe_i32 v5, v18, 13, 1 -; GFX8-NEXT: v_bfe_i32 v4, v18, 12, 1 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NEXT: v_bfe_i32 v11, v18, 11, 1 -; GFX8-NEXT: v_bfe_i32 v10, v18, 10, 1 -; GFX8-NEXT: v_bfe_i32 v9, v18, 9, 1 -; GFX8-NEXT: v_bfe_i32 v8, v18, 8, 1 -; GFX8-NEXT: v_bfe_i32 v3, v18, 3, 1 -; GFX8-NEXT: v_bfe_i32 v2, v18, 2, 1 -; GFX8-NEXT: v_bfe_i32 v1, v18, 1, 1 -; GFX8-NEXT: v_bfe_i32 v0, v18, 0, 1 -; GFX8-NEXT: v_bfe_i32 v7, v18, 7, 1 -; GFX8-NEXT: v_bfe_i32 v6, v18, 6, 1 -; GFX8-NEXT: v_bfe_i32 v5, v18, 5, 1 -; GFX8-NEXT: v_bfe_i32 v4, v18, 4, 1 +; GFX8-NEXT: v_bfe_i32 v11, v20, 15, 1 +; GFX8-NEXT: v_bfe_i32 v10, v20, 14, 1 +; GFX8-NEXT: v_bfe_i32 v9, v20, 13, 1 +; GFX8-NEXT: v_bfe_i32 v8, v20, 12, 1 ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GFX8-NEXT: v_bfe_i32 v3, v20, 3, 1 +; GFX8-NEXT: v_bfe_i32 v11, v20, 11, 1 +; GFX8-NEXT: v_bfe_i32 v10, v20, 10, 1 +; GFX8-NEXT: v_bfe_i32 v9, v20, 9, 1 +; GFX8-NEXT: v_bfe_i32 v8, v20, 8, 1 +; GFX8-NEXT: v_bfe_i32 v2, v20, 2, 1 +; GFX8-NEXT: v_bfe_i32 v1, v20, 1, 1 +; GFX8-NEXT: v_bfe_i32 v0, v20, 0, 1 +; GFX8-NEXT: v_bfe_i32 v7, v20, 7, 1 +; GFX8-NEXT: v_bfe_i32 v6, v20, 6, 1 +; GFX8-NEXT: v_bfe_i32 v5, v20, 5, 1 +; GFX8-NEXT: v_bfe_i32 v4, v20, 4, 1 +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[4:7] ; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -3177,164 +3177,164 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s4, s2, 0x10003 -; GFX6-NEXT: s_bfe_u32 s5, s2, 0x10001 -; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10007 -; GFX6-NEXT: s_bfe_u32 s7, s2, 0x10005 -; GFX6-NEXT: s_bfe_u32 s8, s2, 0x1000b -; GFX6-NEXT: s_bfe_u32 s9, s2, 0x10009 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0x1000f -; GFX6-NEXT: s_bfe_u32 s13, s2, 0x1000d -; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10013 -; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10011 -; GFX6-NEXT: s_bfe_u32 s16, s2, 0x10017 -; GFX6-NEXT: s_bfe_u32 s17, s2, 0x10015 -; GFX6-NEXT: s_bfe_u32 s18, s2, 0x1001b -; GFX6-NEXT: s_bfe_u32 s19, s2, 0x10019 -; GFX6-NEXT: s_lshr_b32 s20, s2, 31 -; GFX6-NEXT: s_bfe_u32 s21, s2, 0x1001d -; GFX6-NEXT: s_bfe_u32 s22, s3, 0x10003 -; GFX6-NEXT: s_bfe_u32 s23, s3, 0x10001 -; GFX6-NEXT: s_bfe_u32 s24, s3, 0x10007 -; GFX6-NEXT: s_bfe_u32 s25, s3, 0x10005 -; GFX6-NEXT: s_bfe_u32 s26, s3, 0x1000b -; GFX6-NEXT: s_bfe_u32 s27, s3, 0x10009 -; GFX6-NEXT: s_bfe_u32 s28, s3, 0x1000f -; GFX6-NEXT: s_bfe_u32 s29, s3, 0x1000d -; GFX6-NEXT: s_bfe_u32 s30, s3, 0x10013 -; GFX6-NEXT: s_bfe_u32 s31, s3, 0x10011 -; GFX6-NEXT: s_bfe_u32 s33, s3, 0x10017 -; GFX6-NEXT: s_bfe_u32 s34, s3, 0x10015 -; GFX6-NEXT: s_bfe_u32 s35, s3, 0x1001b -; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10019 -; GFX6-NEXT: s_lshr_b32 s37, s3, 31 -; GFX6-NEXT: s_bfe_u32 s38, s3, 0x1001d -; GFX6-NEXT: s_and_b32 s12, s2, 1 -; GFX6-NEXT: s_bfe_u32 s11, s2, 0x10002 -; GFX6-NEXT: s_bfe_u32 s39, s2, 0x10006 -; GFX6-NEXT: s_bfe_u32 s40, s2, 0x10004 -; GFX6-NEXT: s_bfe_u32 s41, s2, 0x1000a -; GFX6-NEXT: s_bfe_u32 s42, s2, 0x10008 -; GFX6-NEXT: s_bfe_u32 s43, s2, 0x1000e -; GFX6-NEXT: s_bfe_u32 s44, s2, 0x1000c -; GFX6-NEXT: s_bfe_u32 s45, s2, 0x10012 -; GFX6-NEXT: s_bfe_u32 s46, s2, 0x10010 -; GFX6-NEXT: s_bfe_u32 s47, s2, 0x10016 -; GFX6-NEXT: s_bfe_u32 s48, s2, 0x10014 -; GFX6-NEXT: s_bfe_u32 s49, s2, 0x1001a -; GFX6-NEXT: s_bfe_u32 s50, s2, 0x10018 -; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001e -; GFX6-NEXT: s_bfe_u32 s52, s2, 0x1001c -; GFX6-NEXT: s_and_b32 s53, s3, 1 -; GFX6-NEXT: s_bfe_u32 s54, s3, 0x10002 -; GFX6-NEXT: s_bfe_u32 s55, s3, 0x10006 -; GFX6-NEXT: s_bfe_u32 s56, s3, 0x10004 -; GFX6-NEXT: s_bfe_u32 s57, s3, 0x10008 -; GFX6-NEXT: s_bfe_u32 s58, s3, 0x1000e -; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000c -; GFX6-NEXT: s_bfe_u32 s60, s3, 0x10012 -; GFX6-NEXT: s_bfe_u32 s61, s3, 0x10010 -; GFX6-NEXT: s_bfe_u32 s62, s3, 0x10016 -; GFX6-NEXT: s_bfe_u32 s63, s3, 0x10014 -; GFX6-NEXT: s_bfe_u32 s64, s3, 0x1001a -; GFX6-NEXT: s_bfe_u32 s65, s3, 0x10018 -; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001e -; GFX6-NEXT: s_bfe_u32 s67, s3, 0x1001c -; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1000a +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s67 -; GFX6-NEXT: v_mov_b32_e32 v1, s38 -; GFX6-NEXT: v_mov_b32_e32 v2, s66 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: v_mov_b32_e32 v4, s65 -; GFX6-NEXT: v_mov_b32_e32 v5, s36 -; GFX6-NEXT: v_mov_b32_e32 v6, s64 -; GFX6-NEXT: v_mov_b32_e32 v7, s35 -; GFX6-NEXT: v_mov_b32_e32 v8, s63 -; GFX6-NEXT: v_mov_b32_e32 v9, s34 -; GFX6-NEXT: v_mov_b32_e32 v10, s62 -; GFX6-NEXT: v_mov_b32_e32 v11, s33 -; GFX6-NEXT: v_mov_b32_e32 v12, s61 -; GFX6-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NEXT: v_mov_b32_e32 v14, s60 -; GFX6-NEXT: v_mov_b32_e32 v15, s30 -; GFX6-NEXT: v_mov_b32_e32 v16, s59 -; GFX6-NEXT: v_mov_b32_e32 v17, s29 -; GFX6-NEXT: v_mov_b32_e32 v18, s58 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s6, s4, 0x10003 +; GFX6-NEXT: s_bfe_u32 s7, s4, 0x10001 +; GFX6-NEXT: s_bfe_u32 s8, s4, 0x10007 +; GFX6-NEXT: s_bfe_u32 s9, s4, 0x10005 +; GFX6-NEXT: s_bfe_u32 s10, s4, 0x1000b +; GFX6-NEXT: s_bfe_u32 s11, s4, 0x10009 +; GFX6-NEXT: s_bfe_u32 s12, s4, 0x1000f +; GFX6-NEXT: s_bfe_u32 s15, s4, 0x1000d +; GFX6-NEXT: s_bfe_u32 s16, s4, 0x10013 +; GFX6-NEXT: s_bfe_u32 s17, s4, 0x10011 +; GFX6-NEXT: s_bfe_u32 s18, s4, 0x10017 +; GFX6-NEXT: s_bfe_u32 s19, s4, 0x10015 +; GFX6-NEXT: s_bfe_u32 s20, s4, 0x1001b +; GFX6-NEXT: s_bfe_u32 s21, s4, 0x10019 +; GFX6-NEXT: s_lshr_b32 s22, s4, 31 +; GFX6-NEXT: s_bfe_u32 s23, s4, 0x1001d +; GFX6-NEXT: s_bfe_u32 s24, s5, 0x10003 +; GFX6-NEXT: s_bfe_u32 s25, s5, 0x10001 +; GFX6-NEXT: s_bfe_u32 s26, s5, 0x10007 +; GFX6-NEXT: s_bfe_u32 s27, s5, 0x10005 +; GFX6-NEXT: s_bfe_u32 s28, s5, 0x1000b +; GFX6-NEXT: s_bfe_u32 s29, s5, 0x10009 +; GFX6-NEXT: s_bfe_u32 s30, s5, 0x1000f +; GFX6-NEXT: s_bfe_u32 s31, s5, 0x1000d +; GFX6-NEXT: s_bfe_u32 s33, s5, 0x10013 +; GFX6-NEXT: s_bfe_u32 s34, s5, 0x10011 +; GFX6-NEXT: s_bfe_u32 s35, s5, 0x10017 +; GFX6-NEXT: s_bfe_u32 s36, s5, 0x10015 +; GFX6-NEXT: s_bfe_u32 s37, s5, 0x1001b +; GFX6-NEXT: s_bfe_u32 s38, s5, 0x10019 +; GFX6-NEXT: s_lshr_b32 s39, s5, 31 +; GFX6-NEXT: s_bfe_u32 s40, s5, 0x1001d +; GFX6-NEXT: s_and_b32 s14, s4, 1 +; GFX6-NEXT: s_bfe_u32 s13, s4, 0x10002 +; GFX6-NEXT: s_bfe_u32 s41, s4, 0x10006 +; GFX6-NEXT: s_bfe_u32 s42, s4, 0x10004 +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x1001c +; GFX6-NEXT: s_bfe_u32 s44, s4, 0x1000a +; GFX6-NEXT: v_mov_b32_e32 v0, s43 +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x1001e +; GFX6-NEXT: v_mov_b32_e32 v1, s40 +; GFX6-NEXT: s_bfe_u32 s40, s4, 0x10008 +; GFX6-NEXT: v_mov_b32_e32 v2, s43 +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x10018 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: s_bfe_u32 s39, s4, 0x1000e +; GFX6-NEXT: v_mov_b32_e32 v4, s43 +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x1001a +; GFX6-NEXT: v_mov_b32_e32 v5, s38 +; GFX6-NEXT: s_bfe_u32 s38, s4, 0x1000c +; GFX6-NEXT: v_mov_b32_e32 v6, s43 +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x10014 +; GFX6-NEXT: v_mov_b32_e32 v7, s37 +; GFX6-NEXT: s_bfe_u32 s37, s4, 0x10012 +; GFX6-NEXT: v_mov_b32_e32 v8, s43 +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x10016 +; GFX6-NEXT: v_mov_b32_e32 v9, s36 +; GFX6-NEXT: s_bfe_u32 s36, s4, 0x10010 +; GFX6-NEXT: v_mov_b32_e32 v10, s43 +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x10010 +; GFX6-NEXT: v_mov_b32_e32 v11, s35 +; GFX6-NEXT: s_bfe_u32 s35, s4, 0x10016 +; GFX6-NEXT: v_mov_b32_e32 v12, s43 +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x10012 +; GFX6-NEXT: v_mov_b32_e32 v13, s34 +; GFX6-NEXT: s_bfe_u32 s34, s4, 0x10014 +; GFX6-NEXT: v_mov_b32_e32 v14, s43 +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x1000c +; GFX6-NEXT: v_mov_b32_e32 v15, s33 +; GFX6-NEXT: s_bfe_u32 s33, s4, 0x1001a +; GFX6-NEXT: v_mov_b32_e32 v16, s43 +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x1000e +; GFX6-NEXT: v_mov_b32_e32 v17, s31 +; GFX6-NEXT: s_bfe_u32 s31, s4, 0x10018 +; GFX6-NEXT: v_mov_b32_e32 v18, s43 +; GFX6-NEXT: s_bfe_u32 s43, s4, 0x1001e +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x1001c +; GFX6-NEXT: v_mov_b32_e32 v19, s30 +; GFX6-NEXT: s_bfe_u32 s30, s5, 0x10008 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s57 -; GFX6-NEXT: v_mov_b32_e32 v19, s28 -; GFX6-NEXT: v_mov_b32_e32 v1, s27 -; GFX6-NEXT: v_mov_b32_e32 v2, s68 -; GFX6-NEXT: v_mov_b32_e32 v3, s26 +; GFX6-NEXT: v_mov_b32_e32 v0, s30 +; GFX6-NEXT: s_bfe_u32 s30, s5, 0x1000a +; GFX6-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NEXT: s_and_b32 s29, s5, 1 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: s_bfe_u32 s30, s5, 0x10002 +; GFX6-NEXT: v_mov_b32_e32 v3, s28 +; GFX6-NEXT: s_bfe_u32 s28, s5, 0x10006 +; GFX6-NEXT: s_bfe_u32 s5, s5, 0x10004 ; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s56 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s27 +; GFX6-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NEXT: v_mov_b32_e32 v3, s26 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s29 ; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s55 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 ; GFX6-NEXT: v_mov_b32_e32 v3, s24 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s53 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s23 -; GFX6-NEXT: v_mov_b32_e32 v2, s54 +; GFX6-NEXT: v_mov_b32_e32 v2, s43 ; GFX6-NEXT: v_mov_b32_e32 v3, s22 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s52 +; GFX6-NEXT: v_mov_b32_e32 v0, s31 ; GFX6-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NEXT: v_mov_b32_e32 v2, s51 +; GFX6-NEXT: v_mov_b32_e32 v2, s33 ; GFX6-NEXT: v_mov_b32_e32 v3, s20 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s50 +; GFX6-NEXT: v_mov_b32_e32 v0, s34 ; GFX6-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NEXT: v_mov_b32_e32 v2, s49 +; GFX6-NEXT: v_mov_b32_e32 v2, s35 ; GFX6-NEXT: v_mov_b32_e32 v3, s18 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s48 +; GFX6-NEXT: v_mov_b32_e32 v0, s36 ; GFX6-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NEXT: v_mov_b32_e32 v2, s47 +; GFX6-NEXT: v_mov_b32_e32 v2, s37 ; GFX6-NEXT: v_mov_b32_e32 v3, s16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s46 +; GFX6-NEXT: v_mov_b32_e32 v0, s38 ; GFX6-NEXT: v_mov_b32_e32 v1, s15 -; GFX6-NEXT: v_mov_b32_e32 v2, s45 -; GFX6-NEXT: v_mov_b32_e32 v3, s14 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GFX6-NEXT: v_mov_b32_e32 v2, s39 +; GFX6-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s43 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 ; GFX6-NEXT: v_mov_b32_e32 v3, s10 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s42 ; GFX6-NEXT: v_mov_b32_e32 v1, s9 ; GFX6-NEXT: v_mov_b32_e32 v2, s41 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s13 ; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -4013,164 +4013,164 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s4, s2, 0x10003 -; GFX6-NEXT: s_bfe_i32 s5, s2, 0x10002 -; GFX6-NEXT: s_bfe_i32 s6, s2, 0x10001 -; GFX6-NEXT: s_bfe_i32 s7, s2, 0x10000 -; GFX6-NEXT: s_bfe_i32 s8, s2, 0x10007 -; GFX6-NEXT: s_bfe_i32 s9, s2, 0x10006 -; GFX6-NEXT: s_bfe_i32 s10, s2, 0x10005 -; GFX6-NEXT: s_bfe_i32 s11, s2, 0x10004 -; GFX6-NEXT: s_bfe_i32 s12, s2, 0x1000b -; GFX6-NEXT: s_bfe_i32 s13, s2, 0x1000a -; GFX6-NEXT: s_bfe_i32 s14, s2, 0x10009 -; GFX6-NEXT: s_bfe_i32 s15, s2, 0x10008 -; GFX6-NEXT: s_bfe_i32 s16, s2, 0x1000f -; GFX6-NEXT: s_bfe_i32 s17, s2, 0x1000e -; GFX6-NEXT: s_bfe_i32 s18, s2, 0x1000d -; GFX6-NEXT: s_bfe_i32 s19, s2, 0x1000c -; GFX6-NEXT: s_bfe_i32 s20, s2, 0x10013 -; GFX6-NEXT: s_bfe_i32 s21, s2, 0x10012 -; GFX6-NEXT: s_bfe_i32 s22, s2, 0x10011 -; GFX6-NEXT: s_bfe_i32 s23, s2, 0x10010 -; GFX6-NEXT: s_bfe_i32 s24, s2, 0x10017 -; GFX6-NEXT: s_bfe_i32 s25, s2, 0x10016 -; GFX6-NEXT: s_bfe_i32 s26, s2, 0x10015 -; GFX6-NEXT: s_bfe_i32 s27, s2, 0x10014 -; GFX6-NEXT: s_bfe_i32 s28, s2, 0x1001b -; GFX6-NEXT: s_bfe_i32 s29, s2, 0x1001a -; GFX6-NEXT: s_bfe_i32 s30, s2, 0x10019 -; GFX6-NEXT: s_bfe_i32 s31, s2, 0x10018 -; GFX6-NEXT: s_ashr_i32 s33, s2, 31 -; GFX6-NEXT: s_bfe_i32 s34, s2, 0x1001e -; GFX6-NEXT: s_bfe_i32 s35, s2, 0x1001d -; GFX6-NEXT: s_bfe_i32 s36, s2, 0x1001c -; GFX6-NEXT: s_bfe_i32 s37, s3, 0x10003 -; GFX6-NEXT: s_bfe_i32 s38, s3, 0x10002 -; GFX6-NEXT: s_bfe_i32 s39, s3, 0x10001 -; GFX6-NEXT: s_bfe_i32 s40, s3, 0x10000 -; GFX6-NEXT: s_bfe_i32 s41, s3, 0x10007 -; GFX6-NEXT: s_bfe_i32 s42, s3, 0x10006 -; GFX6-NEXT: s_bfe_i32 s43, s3, 0x10005 -; GFX6-NEXT: s_bfe_i32 s44, s3, 0x10004 -; GFX6-NEXT: s_bfe_i32 s45, s3, 0x1000b -; GFX6-NEXT: s_bfe_i32 s46, s3, 0x1000a -; GFX6-NEXT: s_bfe_i32 s47, s3, 0x10009 -; GFX6-NEXT: s_bfe_i32 s48, s3, 0x10008 -; GFX6-NEXT: s_bfe_i32 s49, s3, 0x1000e -; GFX6-NEXT: s_bfe_i32 s50, s3, 0x1000d -; GFX6-NEXT: s_bfe_i32 s51, s3, 0x1000c -; GFX6-NEXT: s_bfe_i32 s52, s3, 0x10013 -; GFX6-NEXT: s_bfe_i32 s53, s3, 0x10012 -; GFX6-NEXT: s_bfe_i32 s54, s3, 0x10011 -; GFX6-NEXT: s_bfe_i32 s55, s3, 0x10010 -; GFX6-NEXT: s_bfe_i32 s56, s3, 0x10017 -; GFX6-NEXT: s_bfe_i32 s57, s3, 0x10016 -; GFX6-NEXT: s_bfe_i32 s58, s3, 0x10015 -; GFX6-NEXT: s_bfe_i32 s59, s3, 0x10014 -; GFX6-NEXT: s_bfe_i32 s60, s3, 0x1001b -; GFX6-NEXT: s_bfe_i32 s61, s3, 0x1001a -; GFX6-NEXT: s_bfe_i32 s62, s3, 0x10019 -; GFX6-NEXT: s_bfe_i32 s63, s3, 0x10018 -; GFX6-NEXT: s_ashr_i32 s64, s3, 31 -; GFX6-NEXT: s_bfe_i32 s65, s3, 0x1001e -; GFX6-NEXT: s_bfe_i32 s66, s3, 0x1001d -; GFX6-NEXT: s_bfe_i32 s67, s3, 0x1001c -; GFX6-NEXT: s_bfe_i32 s68, s3, 0x1000f +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v0, s67 -; GFX6-NEXT: v_mov_b32_e32 v1, s66 -; GFX6-NEXT: v_mov_b32_e32 v2, s65 -; GFX6-NEXT: v_mov_b32_e32 v3, s64 -; GFX6-NEXT: v_mov_b32_e32 v4, s63 -; GFX6-NEXT: v_mov_b32_e32 v5, s62 -; GFX6-NEXT: v_mov_b32_e32 v6, s61 -; GFX6-NEXT: v_mov_b32_e32 v7, s60 -; GFX6-NEXT: v_mov_b32_e32 v8, s59 -; GFX6-NEXT: v_mov_b32_e32 v9, s58 -; GFX6-NEXT: v_mov_b32_e32 v10, s57 -; GFX6-NEXT: v_mov_b32_e32 v11, s56 -; GFX6-NEXT: v_mov_b32_e32 v12, s55 -; GFX6-NEXT: v_mov_b32_e32 v13, s54 -; GFX6-NEXT: v_mov_b32_e32 v14, s53 -; GFX6-NEXT: v_mov_b32_e32 v15, s52 -; GFX6-NEXT: v_mov_b32_e32 v16, s51 -; GFX6-NEXT: v_mov_b32_e32 v17, s50 -; GFX6-NEXT: v_mov_b32_e32 v18, s49 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s6, s4, 0x10003 +; GFX6-NEXT: s_bfe_i32 s7, s4, 0x10002 +; GFX6-NEXT: s_bfe_i32 s8, s4, 0x10001 +; GFX6-NEXT: s_bfe_i32 s9, s4, 0x10000 +; GFX6-NEXT: s_bfe_i32 s10, s4, 0x10007 +; GFX6-NEXT: s_bfe_i32 s11, s4, 0x10006 +; GFX6-NEXT: s_bfe_i32 s12, s4, 0x10005 +; GFX6-NEXT: s_bfe_i32 s13, s4, 0x10004 +; GFX6-NEXT: s_bfe_i32 s14, s4, 0x1000b +; GFX6-NEXT: s_bfe_i32 s15, s4, 0x1000a +; GFX6-NEXT: s_bfe_i32 s16, s4, 0x10009 +; GFX6-NEXT: s_bfe_i32 s17, s4, 0x10008 +; GFX6-NEXT: s_bfe_i32 s18, s4, 0x1000f +; GFX6-NEXT: s_bfe_i32 s19, s4, 0x1000e +; GFX6-NEXT: s_bfe_i32 s20, s4, 0x1000d +; GFX6-NEXT: s_bfe_i32 s21, s4, 0x1000c +; GFX6-NEXT: s_bfe_i32 s22, s4, 0x10013 +; GFX6-NEXT: s_bfe_i32 s23, s4, 0x10012 +; GFX6-NEXT: s_bfe_i32 s24, s4, 0x10011 +; GFX6-NEXT: s_bfe_i32 s25, s4, 0x10010 +; GFX6-NEXT: s_bfe_i32 s26, s4, 0x10017 +; GFX6-NEXT: s_bfe_i32 s27, s4, 0x10016 +; GFX6-NEXT: s_bfe_i32 s28, s4, 0x10015 +; GFX6-NEXT: s_bfe_i32 s29, s4, 0x10014 +; GFX6-NEXT: s_bfe_i32 s30, s4, 0x1001b +; GFX6-NEXT: s_bfe_i32 s31, s4, 0x1001a +; GFX6-NEXT: s_bfe_i32 s33, s4, 0x10019 +; GFX6-NEXT: s_bfe_i32 s34, s4, 0x10018 +; GFX6-NEXT: s_ashr_i32 s35, s4, 31 +; GFX6-NEXT: s_bfe_i32 s36, s4, 0x1001e +; GFX6-NEXT: s_bfe_i32 s37, s4, 0x1001d +; GFX6-NEXT: s_bfe_i32 s4, s4, 0x1001c +; GFX6-NEXT: s_bfe_i32 s38, s5, 0x10003 +; GFX6-NEXT: s_bfe_i32 s39, s5, 0x10002 +; GFX6-NEXT: s_bfe_i32 s40, s5, 0x10001 +; GFX6-NEXT: s_bfe_i32 s41, s5, 0x10000 +; GFX6-NEXT: s_bfe_i32 s42, s5, 0x10007 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x1001c +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x1001d +; GFX6-NEXT: v_mov_b32_e32 v0, s43 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x1001e +; GFX6-NEXT: v_mov_b32_e32 v1, s44 +; GFX6-NEXT: s_ashr_i32 s44, s5, 31 +; GFX6-NEXT: v_mov_b32_e32 v2, s43 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x10018 +; GFX6-NEXT: v_mov_b32_e32 v3, s44 +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x10019 +; GFX6-NEXT: v_mov_b32_e32 v4, s43 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x1001a +; GFX6-NEXT: v_mov_b32_e32 v5, s44 +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x1001b +; GFX6-NEXT: v_mov_b32_e32 v6, s43 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x10014 +; GFX6-NEXT: v_mov_b32_e32 v7, s44 +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x10015 +; GFX6-NEXT: v_mov_b32_e32 v8, s43 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x10016 +; GFX6-NEXT: v_mov_b32_e32 v9, s44 +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x10017 +; GFX6-NEXT: v_mov_b32_e32 v10, s43 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x10010 +; GFX6-NEXT: v_mov_b32_e32 v11, s44 +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x10011 +; GFX6-NEXT: v_mov_b32_e32 v12, s43 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x10012 +; GFX6-NEXT: v_mov_b32_e32 v13, s44 +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x10013 +; GFX6-NEXT: v_mov_b32_e32 v14, s43 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x1000c +; GFX6-NEXT: v_mov_b32_e32 v15, s44 +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x1000d +; GFX6-NEXT: v_mov_b32_e32 v16, s43 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x1000e +; GFX6-NEXT: v_mov_b32_e32 v17, s44 +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x1000f +; GFX6-NEXT: v_mov_b32_e32 v18, s43 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x10009 +; GFX6-NEXT: v_mov_b32_e32 v19, s44 +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x10008 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s48 -; GFX6-NEXT: v_mov_b32_e32 v19, s68 -; GFX6-NEXT: v_mov_b32_e32 v1, s47 -; GFX6-NEXT: v_mov_b32_e32 v2, s46 -; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x1000a +; GFX6-NEXT: v_mov_b32_e32 v1, s43 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x1000b +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: s_bfe_i32 s44, s5, 0x10006 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NEXT: s_bfe_i32 s43, s5, 0x10005 +; GFX6-NEXT: s_bfe_i32 s5, s5, 0x10004 ; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s42 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s39 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: v_mov_b32_e32 v0, s41 +; GFX6-NEXT: v_mov_b32_e32 v1, s40 +; GFX6-NEXT: v_mov_b32_e32 v2, s39 +; GFX6-NEXT: v_mov_b32_e32 v3, s38 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s36 -; GFX6-NEXT: v_mov_b32_e32 v1, s35 -; GFX6-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NEXT: v_mov_b32_e32 v3, s33 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s35 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s31 -; GFX6-NEXT: v_mov_b32_e32 v1, s30 -; GFX6-NEXT: v_mov_b32_e32 v2, s29 -; GFX6-NEXT: v_mov_b32_e32 v3, s28 +; GFX6-NEXT: v_mov_b32_e32 v0, s34 +; GFX6-NEXT: v_mov_b32_e32 v1, s33 +; GFX6-NEXT: v_mov_b32_e32 v2, s31 +; GFX6-NEXT: v_mov_b32_e32 v3, s30 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s27 -; GFX6-NEXT: v_mov_b32_e32 v1, s26 -; GFX6-NEXT: v_mov_b32_e32 v2, s25 -; GFX6-NEXT: v_mov_b32_e32 v3, s24 +; GFX6-NEXT: v_mov_b32_e32 v0, s29 +; GFX6-NEXT: v_mov_b32_e32 v1, s28 +; GFX6-NEXT: v_mov_b32_e32 v2, s27 +; GFX6-NEXT: v_mov_b32_e32 v3, s26 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s23 -; GFX6-NEXT: v_mov_b32_e32 v1, s22 -; GFX6-NEXT: v_mov_b32_e32 v2, s21 -; GFX6-NEXT: v_mov_b32_e32 v3, s20 +; GFX6-NEXT: v_mov_b32_e32 v0, s25 +; GFX6-NEXT: v_mov_b32_e32 v1, s24 +; GFX6-NEXT: v_mov_b32_e32 v2, s23 +; GFX6-NEXT: v_mov_b32_e32 v3, s22 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s19 -; GFX6-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NEXT: v_mov_b32_e32 v2, s17 -; GFX6-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NEXT: v_mov_b32_e32 v0, s21 +; GFX6-NEXT: v_mov_b32_e32 v1, s20 +; GFX6-NEXT: v_mov_b32_e32 v2, s19 +; GFX6-NEXT: v_mov_b32_e32 v3, s18 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s15 -; GFX6-NEXT: v_mov_b32_e32 v1, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 -; GFX6-NEXT: v_mov_b32_e32 v3, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s17 +; GFX6-NEXT: v_mov_b32_e32 v1, s16 +; GFX6-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NEXT: v_mov_b32_e32 v3, s14 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s11 -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mov_b32_e32 v1, s12 +; GFX6-NEXT: v_mov_b32_e32 v2, s11 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -6020,40 +6020,40 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: s_add_u32 s4, s0, 32 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v18, s3 +; GFX8-NEXT: v_mov_b32_e32 v17, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NEXT: v_mov_b32_e32 v16, s1 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s0 +; GFX8-NEXT: s_add_u32 s0, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s5 +; GFX8-NEXT: v_mov_b32_e32 v20, s3 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: v_mov_b32_e32 v15, s4 +; GFX8-NEXT: v_mov_b32_e32 v19, s2 +; GFX8-NEXT: v_mov_b32_e32 v22, s1 ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_mov_b32_e32 v21, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v12, v1 ; GFX8-NEXT: v_mov_b32_e32 v14, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v6, v0, 5, 1 ; GFX8-NEXT: v_bfe_u32 v4, v0, 4, 1 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[4:7] -; GFX8-NEXT: v_mov_b32_e32 v16, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v18, s1 -; GFX8-NEXT: v_mov_b32_e32 v17, s0 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v0 +; GFX8-NEXT: flat_store_dwordx4 v[19:20], v[4:7] ; GFX8-NEXT: v_bfe_u32 v9, v0, 3, 1 ; GFX8-NEXT: v_bfe_u32 v7, v0, 2, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s2 ; GFX8-NEXT: v_bfe_u32 v13, v0, 1, 1 ; GFX8-NEXT: v_and_b32_e32 v11, 1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 7, v6 -; GFX8-NEXT: v_bfe_u32 v0, v6, 6, 1 -; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[7:10] -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[11:14] +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 7, v23 +; GFX8-NEXT: v_bfe_u32 v0, v23, 6, 1 +; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[7:10] +; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v8i1_to_v8i64: @@ -6441,22 +6441,22 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_bfe_u32 v27, v29, 5, 1 ; GFX6-NEXT: v_bfe_u32 v2, v29, 11, 1 ; GFX6-NEXT: v_bfe_u32 v0, v29, 10, 1 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NEXT: v_bfe_u32 v23, v29, 7, 1 +; GFX6-NEXT: v_bfe_u32 v19, v29, 1, 1 ; GFX6-NEXT: v_bfe_u32 v5, v29, 9, 1 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_bfe_u32 v3, v29, 8, 1 ; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:64 +; GFX6-NEXT: v_bfe_u32 v15, v29, 3, 1 +; GFX6-NEXT: v_bfe_u32 v11, v29, 13, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 15, v29 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_bfe_u32 v6, v29, 14, 1 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GFX6-NEXT: v_bfe_u32 v27, v29, 5, 1 -; GFX6-NEXT: v_bfe_u32 v23, v29, 7, 1 -; GFX6-NEXT: v_bfe_u32 v19, v29, 1, 1 -; GFX6-NEXT: v_bfe_u32 v15, v29, 3, 1 -; GFX6-NEXT: v_bfe_u32 v11, v29, 13, 1 ; GFX6-NEXT: v_bfe_u32 v25, v29, 4, 1 ; GFX6-NEXT: v_bfe_u32 v21, v29, 6, 1 ; GFX6-NEXT: v_and_b32_e32 v17, 1, v29 @@ -6752,68 +6752,68 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s8, s6 ; GFX6-NEXT: s_mov_b32 s9, s7 -; GFX6-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX6-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 15, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 13, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v11, 10, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v12, 11, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v14, 8, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v16, 9, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v15, 6, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v9, 4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 5, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 2, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 3, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v13, 1, v1 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 15, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 12, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 13, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 10, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v13, 11, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v15, 8, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v16, 9, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v12, 6, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v14, 7, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 5, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 1 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX6-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:112 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_bfe_i32 v6, v10, 0, 1 -; GFX6-NEXT: v_bfe_i32 v4, v9, 0, 1 -; GFX6-NEXT: v_bfe_i32 v9, v8, 0, 1 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 1 -; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_bfe_i32 v9, v12, 0, 1 -; GFX6-NEXT: v_bfe_i32 v7, v11, 0, 1 -; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 1 -; GFX6-NEXT: v_bfe_i32 v11, v1, 0, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 7, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80 -; GFX6-NEXT: v_bfe_i32 v17, v1, 0, 1 -; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 1 -; GFX6-NEXT: v_bfe_i32 v21, v16, 0, 1 -; GFX6-NEXT: v_bfe_i32 v19, v14, 0, 1 -; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 1 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 1 +; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 1 +; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 1 +; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 1 +; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 1 +; GFX6-NEXT: v_bfe_i32 v18, v16, 0, 1 +; GFX6-NEXT: v_bfe_i32 v16, v15, 0, 1 +; GFX6-NEXT: v_bfe_i32 v21, v13, 0, 1 +; GFX6-NEXT: v_bfe_i32 v19, v11, 0, 1 +; GFX6-NEXT: v_bfe_i32 v24, v9, 0, 1 +; GFX6-NEXT: v_bfe_i32 v22, v7, 0, 1 +; GFX6-NEXT: v_bfe_i32 v27, v5, 0, 1 +; GFX6-NEXT: v_bfe_i32 v25, v3, 0, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v28, 31, v27 +; GFX6-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GFX6-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX6-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GFX6-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:96 +; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GFX6-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GFX6-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:64 -; GFX6-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:80 +; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i64: @@ -6862,69 +6862,69 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x60 +; GFX8-NEXT: v_mov_b32_e32 v17, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v16, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x60 +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[2:5] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5] -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_mov_b32_e32 v7, s7 ; GFX8-NEXT: v_mov_b32_e32 v8, s8 ; GFX8-NEXT: v_mov_b32_e32 v9, s9 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_add_u32 s2, s0, 0x50 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[6:9] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[6:9] -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v10, s10 ; GFX8-NEXT: v_mov_b32_e32 v11, s11 ; GFX8-NEXT: v_mov_b32_e32 v12, s12 ; GFX8-NEXT: v_mov_b32_e32 v13, s13 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_add_u32 s2, s0, 64 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[10:13] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: v_mov_b32_e32 v5, s17 +; GFX8-NEXT: v_mov_b32_e32 v14, s14 +; GFX8-NEXT: v_mov_b32_e32 v15, s15 +; GFX8-NEXT: v_mov_b32_e32 v16, s16 +; GFX8-NEXT: v_mov_b32_e32 v17, s17 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[2:5] -; GFX8-NEXT: v_mov_b32_e32 v6, s18 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[14:17] +; GFX8-NEXT: v_mov_b32_e32 v4, s18 +; GFX8-NEXT: v_mov_b32_e32 v17, s3 +; GFX8-NEXT: v_mov_b32_e32 v16, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v7, s19 -; GFX8-NEXT: v_mov_b32_e32 v8, s20 -; GFX8-NEXT: v_mov_b32_e32 v9, s21 +; GFX8-NEXT: v_mov_b32_e32 v5, s19 +; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v7, s21 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[6:9] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v10, s22 -; GFX8-NEXT: v_mov_b32_e32 v11, s23 -; GFX8-NEXT: v_mov_b32_e32 v12, s24 -; GFX8-NEXT: v_mov_b32_e32 v13, s25 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v8, s22 +; GFX8-NEXT: v_mov_b32_e32 v9, s23 +; GFX8-NEXT: v_mov_b32_e32 v10, s24 +; GFX8-NEXT: v_mov_b32_e32 v11, s25 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[10:13] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[8:11] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v14, s26 -; GFX8-NEXT: v_mov_b32_e32 v15, s27 -; GFX8-NEXT: v_mov_b32_e32 v16, s28 -; GFX8-NEXT: v_mov_b32_e32 v17, s29 +; GFX8-NEXT: v_mov_b32_e32 v12, s26 +; GFX8-NEXT: v_mov_b32_e32 v13, s27 +; GFX8-NEXT: v_mov_b32_e32 v14, s28 +; GFX8-NEXT: v_mov_b32_e32 v15, s29 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s30 ; GFX8-NEXT: v_mov_b32_e32 v3, s31 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[14:17] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[12:15] ; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -7799,156 +7799,155 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s38, s4, 30 -; GFX6-NEXT: s_lshr_b32 s40, s4, 31 -; GFX6-NEXT: s_lshr_b32 s34, s4, 28 -; GFX6-NEXT: s_lshr_b32 s36, s4, 29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 26 -; GFX6-NEXT: s_lshr_b32 s30, s4, 27 -; GFX6-NEXT: s_lshr_b32 s24, s4, 24 -; GFX6-NEXT: s_lshr_b32 s26, s4, 25 -; GFX6-NEXT: s_lshr_b32 s20, s4, 22 -; GFX6-NEXT: s_lshr_b32 s22, s4, 23 -; GFX6-NEXT: s_lshr_b32 s18, s4, 20 -; GFX6-NEXT: s_lshr_b32 s6, s4, 21 -; GFX6-NEXT: s_lshr_b32 s8, s4, 18 -; GFX6-NEXT: s_lshr_b32 s10, s4, 19 -; GFX6-NEXT: s_lshr_b32 s12, s4, 16 -; GFX6-NEXT: s_lshr_b32 s14, s4, 17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 14 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 -; GFX6-NEXT: s_lshr_b32 s42, s4, 15 -; GFX6-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: s_lshr_b32 s44, s4, 12 +; GFX6-NEXT: s_lshr_b32 s50, s4, 30 +; GFX6-NEXT: s_lshr_b32 s48, s4, 31 +; GFX6-NEXT: s_lshr_b32 s44, s4, 28 +; GFX6-NEXT: s_lshr_b32 s42, s4, 29 +; GFX6-NEXT: s_lshr_b32 s38, s4, 26 +; GFX6-NEXT: s_lshr_b32 s36, s4, 27 +; GFX6-NEXT: s_lshr_b32 s34, s4, 24 +; GFX6-NEXT: s_lshr_b32 s30, s4, 25 +; GFX6-NEXT: s_lshr_b32 s26, s4, 22 +; GFX6-NEXT: s_lshr_b32 s20, s4, 23 +; GFX6-NEXT: s_lshr_b32 s16, s4, 20 +; GFX6-NEXT: s_lshr_b32 s12, s4, 21 +; GFX6-NEXT: s_lshr_b32 s18, s4, 18 +; GFX6-NEXT: s_lshr_b32 s24, s4, 19 +; GFX6-NEXT: s_lshr_b32 s40, s4, 16 +; GFX6-NEXT: s_lshr_b32 s46, s4, 17 +; GFX6-NEXT: s_lshr_b32 s52, s4, 14 +; GFX6-NEXT: s_lshr_b32 s54, s4, 15 +; GFX6-NEXT: s_lshr_b32 s56, s4, 12 +; GFX6-NEXT: s_lshr_b32 s58, s4, 13 +; GFX6-NEXT: s_lshr_b32 s60, s4, 10 +; GFX6-NEXT: s_lshr_b32 s62, s4, 11 +; GFX6-NEXT: s_lshr_b32 s64, s4, 8 +; GFX6-NEXT: s_lshr_b32 s66, s4, 9 +; GFX6-NEXT: s_lshr_b32 s28, s4, 6 +; GFX6-NEXT: s_lshr_b32 s22, s4, 7 +; GFX6-NEXT: s_lshr_b32 s14, s4, 4 +; GFX6-NEXT: s_lshr_b32 s10, s4, 5 +; GFX6-NEXT: s_lshr_b32 s8, s4, 2 +; GFX6-NEXT: s_lshr_b32 s6, s4, 3 +; GFX6-NEXT: s_lshr_b32 s70, s4, 1 +; GFX6-NEXT: s_bfe_i64 s[68:69], s[4:5], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[70:71], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s68 +; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v1, s69 +; GFX6-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s50 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v3, s51 +; GFX6-NEXT: s_bfe_i64 s[50:51], s[64:65], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v4, s48 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v5, s49 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[62:63], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v6, s44 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v7, s45 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[60:61], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s42 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 13 -; GFX6-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 10 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v9, s43 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v10, s38 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v6, s34 -; GFX6-NEXT: v_mov_b32_e32 v7, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 8 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v11, s39 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[56:57], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s36 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v13, s37 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[54:55], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s34 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v10, s28 -; GFX6-NEXT: v_mov_b32_e32 v11, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 9 -; GFX6-NEXT: v_mov_b32_e32 v12, s30 -; GFX6-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v15, s35 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[52:53], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v16, s30 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v14, s24 -; GFX6-NEXT: v_mov_b32_e32 v15, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 7 -; GFX6-NEXT: v_mov_b32_e32 v16, s26 -; GFX6-NEXT: v_mov_b32_e32 v17, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 4 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v17, s31 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[46:47], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v18, s26 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v19, s27 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240 +; GFX6-NEXT: v_mov_b32_e32 v20, s20 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v21, s21 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[24:25], 0x10000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v4, s22 -; GFX6-NEXT: v_mov_b32_e32 v5, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s18 -; GFX6-NEXT: v_mov_b32_e32 v7, s19 -; GFX6-NEXT: s_lshr_b32 s18, s4, 3 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 ; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:192 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 -; GFX6-NEXT: v_mov_b32_e32 v8, s6 -; GFX6-NEXT: v_mov_b32_e32 v9, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 -; GFX6-NEXT: s_waitcnt expcnt(1) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NEXT: v_mov_b32_e32 v5, s11 +; GFX6-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:176 +; GFX6-NEXT: v_mov_b32_e32 v4, s12 +; GFX6-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: v_mov_b32_e32 v4, s20 +; GFX6-NEXT: v_mov_b32_e32 v5, s21 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NEXT: v_mov_b32_e32 v4, s14 -; GFX6-NEXT: v_mov_b32_e32 v5, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: v_mov_b32_e32 v4, s30 +; GFX6-NEXT: v_mov_b32_e32 v5, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NEXT: v_mov_b32_e32 v4, s36 +; GFX6-NEXT: v_mov_b32_e32 v5, s37 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 ; GFX6-NEXT: v_mov_b32_e32 v4, s42 ; GFX6-NEXT: v_mov_b32_e32 v5, s43 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s44 ; GFX6-NEXT: v_mov_b32_e32 v3, s45 -; GFX6-NEXT: v_mov_b32_e32 v4, s38 -; GFX6-NEXT: v_mov_b32_e32 v5, s39 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: v_mov_b32_e32 v4, s34 -; GFX6-NEXT: v_mov_b32_e32 v5, s35 +; GFX6-NEXT: v_mov_b32_e32 v4, s48 +; GFX6-NEXT: v_mov_b32_e32 v5, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: v_mov_b32_e32 v4, s28 -; GFX6-NEXT: v_mov_b32_e32 v5, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s50 +; GFX6-NEXT: v_mov_b32_e32 v3, s51 +; GFX6-NEXT: v_mov_b32_e32 v4, s66 +; GFX6-NEXT: v_mov_b32_e32 v5, s67 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 -; GFX6-NEXT: v_mov_b32_e32 v4, s24 -; GFX6-NEXT: v_mov_b32_e32 v5, s25 +; GFX6-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NEXT: v_mov_b32_e32 v4, s22 +; GFX6-NEXT: v_mov_b32_e32 v5, s23 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: v_mov_b32_e32 v4, s20 -; GFX6-NEXT: v_mov_b32_e32 v5, s21 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: v_mov_b32_e32 v5, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s22 -; GFX6-NEXT: v_mov_b32_e32 v3, s23 -; GFX6-NEXT: v_mov_b32_e32 v4, s18 -; GFX6-NEXT: v_mov_b32_e32 v5, s19 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -8638,202 +8637,202 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_u32 s4, s2, 0x10003 -; GFX6-NEXT: s_bfe_u32 s5, s2, 0x10005 -; GFX6-NEXT: s_bfe_u32 s8, s2, 0x10007 -; GFX6-NEXT: s_bfe_u32 s11, s2, 0x10009 -; GFX6-NEXT: s_bfe_u32 s13, s2, 0x1000b -; GFX6-NEXT: s_bfe_u32 s15, s2, 0x1000d -; GFX6-NEXT: s_bfe_u32 s17, s2, 0x1000f -; GFX6-NEXT: s_bfe_u32 s19, s2, 0x10011 -; GFX6-NEXT: s_bfe_u32 s21, s2, 0x10013 -; GFX6-NEXT: s_bfe_u32 s23, s2, 0x10015 -; GFX6-NEXT: s_bfe_u32 s25, s2, 0x10017 -; GFX6-NEXT: s_bfe_u32 s27, s2, 0x10019 -; GFX6-NEXT: s_bfe_u32 s29, s2, 0x1001b -; GFX6-NEXT: s_bfe_u32 s31, s2, 0x1001d -; GFX6-NEXT: s_lshr_b32 s34, s2, 31 -; GFX6-NEXT: s_bfe_u32 s35, s3, 0x10003 -; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10005 -; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10007 -; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10009 -; GFX6-NEXT: s_bfe_u32 s39, s3, 0x1000b -; GFX6-NEXT: s_bfe_u32 s40, s3, 0x1000d -; GFX6-NEXT: s_bfe_u32 s41, s3, 0x1000f -; GFX6-NEXT: s_bfe_u32 s42, s3, 0x10011 -; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10013 -; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10015 -; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10017 -; GFX6-NEXT: s_bfe_u32 s46, s3, 0x10019 -; GFX6-NEXT: s_bfe_u32 s47, s3, 0x1001b -; GFX6-NEXT: s_bfe_u32 s48, s3, 0x1001d -; GFX6-NEXT: s_lshr_b32 s49, s3, 31 -; GFX6-NEXT: s_bfe_u32 s9, s3, 0x10001 -; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10001 -; GFX6-NEXT: s_and_b32 s7, s2, 1 -; GFX6-NEXT: s_and_b32 s10, s3, 1 -; GFX6-NEXT: s_bfe_u32 s12, s2, 0x10002 -; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10004 -; GFX6-NEXT: s_bfe_u32 s16, s2, 0x10006 -; GFX6-NEXT: s_bfe_u32 s18, s2, 0x10008 -; GFX6-NEXT: s_bfe_u32 s20, s2, 0x1000a -; GFX6-NEXT: s_bfe_u32 s22, s2, 0x1000c -; GFX6-NEXT: s_bfe_u32 s24, s2, 0x1000e -; GFX6-NEXT: s_bfe_u32 s26, s2, 0x10010 -; GFX6-NEXT: s_bfe_u32 s28, s2, 0x10012 -; GFX6-NEXT: s_bfe_u32 s30, s2, 0x10014 -; GFX6-NEXT: s_bfe_u32 s33, s2, 0x10016 -; GFX6-NEXT: s_bfe_u32 s50, s2, 0x10018 -; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001a -; GFX6-NEXT: s_bfe_u32 s52, s2, 0x1001c -; GFX6-NEXT: s_bfe_u32 s53, s2, 0x1001e -; GFX6-NEXT: s_bfe_u32 s54, s3, 0x10002 -; GFX6-NEXT: s_bfe_u32 s55, s3, 0x10004 -; GFX6-NEXT: s_bfe_u32 s56, s3, 0x10006 -; GFX6-NEXT: s_bfe_u32 s57, s3, 0x10008 -; GFX6-NEXT: s_bfe_u32 s58, s3, 0x1000a -; GFX6-NEXT: s_bfe_u32 s59, s3, 0x1000c -; GFX6-NEXT: s_bfe_u32 s60, s3, 0x1000e -; GFX6-NEXT: s_bfe_u32 s61, s3, 0x10010 -; GFX6-NEXT: s_bfe_u32 s62, s3, 0x10012 -; GFX6-NEXT: s_bfe_u32 s63, s3, 0x10014 -; GFX6-NEXT: s_bfe_u32 s64, s3, 0x10016 -; GFX6-NEXT: s_bfe_u32 s65, s3, 0x10018 -; GFX6-NEXT: s_bfe_u32 s66, s3, 0x1001a -; GFX6-NEXT: s_bfe_u32 s67, s3, 0x1001e -; GFX6-NEXT: s_bfe_u32 s68, s3, 0x1001c +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mov_b32_e32 v0, s67 -; GFX6-NEXT: v_mov_b32_e32 v2, s49 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s6, s4, 0x10003 +; GFX6-NEXT: s_bfe_u32 s7, s4, 0x10005 +; GFX6-NEXT: s_bfe_u32 s10, s4, 0x10007 +; GFX6-NEXT: s_bfe_u32 s13, s4, 0x10009 +; GFX6-NEXT: s_bfe_u32 s15, s4, 0x1000b +; GFX6-NEXT: s_bfe_u32 s17, s4, 0x1000d +; GFX6-NEXT: s_bfe_u32 s19, s4, 0x1000f +; GFX6-NEXT: s_bfe_u32 s21, s4, 0x10011 +; GFX6-NEXT: s_bfe_u32 s22, s4, 0x10013 +; GFX6-NEXT: s_bfe_u32 s24, s4, 0x10015 +; GFX6-NEXT: s_bfe_u32 s26, s4, 0x10017 +; GFX6-NEXT: s_bfe_u32 s27, s4, 0x10019 +; GFX6-NEXT: s_bfe_u32 s28, s4, 0x1001b +; GFX6-NEXT: s_bfe_u32 s29, s4, 0x1001d +; GFX6-NEXT: s_lshr_b32 s30, s4, 31 +; GFX6-NEXT: s_bfe_u32 s31, s5, 0x10003 +; GFX6-NEXT: s_bfe_u32 s33, s5, 0x10005 +; GFX6-NEXT: s_bfe_u32 s34, s5, 0x10007 +; GFX6-NEXT: s_bfe_u32 s35, s5, 0x10009 +; GFX6-NEXT: s_bfe_u32 s36, s5, 0x1000b +; GFX6-NEXT: s_bfe_u32 s37, s5, 0x1000d +; GFX6-NEXT: s_bfe_u32 s38, s5, 0x1000f +; GFX6-NEXT: s_bfe_u32 s39, s5, 0x10011 +; GFX6-NEXT: s_bfe_u32 s40, s5, 0x10013 +; GFX6-NEXT: s_bfe_u32 s41, s5, 0x10015 +; GFX6-NEXT: s_bfe_u32 s42, s5, 0x10017 +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x10019 +; GFX6-NEXT: s_bfe_u32 s44, s5, 0x1001b +; GFX6-NEXT: s_bfe_u32 s25, s5, 0x1001d +; GFX6-NEXT: s_lshr_b32 s23, s5, 31 +; GFX6-NEXT: s_bfe_u32 s11, s5, 0x10001 +; GFX6-NEXT: s_bfe_u32 s8, s4, 0x10001 +; GFX6-NEXT: s_and_b32 s9, s4, 1 +; GFX6-NEXT: s_and_b32 s12, s5, 1 +; GFX6-NEXT: s_bfe_u32 s14, s4, 0x10002 +; GFX6-NEXT: s_bfe_u32 s16, s4, 0x10004 +; GFX6-NEXT: s_bfe_u32 s20, s5, 0x1001e +; GFX6-NEXT: s_bfe_u32 s18, s4, 0x10006 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: s_bfe_u32 s20, s4, 0x10008 +; GFX6-NEXT: v_mov_b32_e32 v2, s23 +; GFX6-NEXT: s_bfe_u32 s23, s5, 0x1001c ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s68 -; GFX6-NEXT: v_mov_b32_e32 v2, s48 +; GFX6-NEXT: v_mov_b32_e32 v0, s23 +; GFX6-NEXT: s_bfe_u32 s23, s4, 0x1000a +; GFX6-NEXT: v_mov_b32_e32 v2, s25 +; GFX6-NEXT: s_bfe_u32 s25, s5, 0x1001a ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NEXT: v_mov_b32_e32 v2, s47 +; GFX6-NEXT: v_mov_b32_e32 v0, s25 +; GFX6-NEXT: s_bfe_u32 s25, s4, 0x1000c +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: s_bfe_u32 s44, s5, 0x10018 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s65 -; GFX6-NEXT: v_mov_b32_e32 v2, s46 +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: s_bfe_u32 s44, s4, 0x1000e +; GFX6-NEXT: v_mov_b32_e32 v2, s43 +; GFX6-NEXT: s_bfe_u32 s43, s5, 0x10016 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s64 -; GFX6-NEXT: v_mov_b32_e32 v2, s45 +; GFX6-NEXT: v_mov_b32_e32 v0, s43 +; GFX6-NEXT: s_bfe_u32 s43, s4, 0x10010 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: s_bfe_u32 s42, s5, 0x10014 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s63 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v0, s42 +; GFX6-NEXT: s_bfe_u32 s42, s4, 0x10012 +; GFX6-NEXT: v_mov_b32_e32 v2, s41 +; GFX6-NEXT: s_bfe_u32 s41, s5, 0x10012 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s62 -; GFX6-NEXT: v_mov_b32_e32 v2, s43 +; GFX6-NEXT: v_mov_b32_e32 v0, s41 +; GFX6-NEXT: s_bfe_u32 s41, s4, 0x10014 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: s_bfe_u32 s40, s5, 0x10010 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s61 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: s_bfe_u32 s40, s4, 0x10016 +; GFX6-NEXT: v_mov_b32_e32 v2, s39 +; GFX6-NEXT: s_bfe_u32 s39, s5, 0x1000e ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NEXT: v_mov_b32_e32 v2, s41 +; GFX6-NEXT: v_mov_b32_e32 v0, s39 +; GFX6-NEXT: s_bfe_u32 s39, s4, 0x10018 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: s_bfe_u32 s38, s5, 0x1000c ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s59 -; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v0, s38 +; GFX6-NEXT: s_bfe_u32 s38, s4, 0x1001a +; GFX6-NEXT: v_mov_b32_e32 v2, s37 +; GFX6-NEXT: s_bfe_u32 s37, s5, 0x1000a ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s58 -; GFX6-NEXT: v_mov_b32_e32 v2, s39 +; GFX6-NEXT: v_mov_b32_e32 v0, s37 +; GFX6-NEXT: s_bfe_u32 s37, s4, 0x1001c +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x1001e +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: s_bfe_u32 s36, s5, 0x10008 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s57 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v0, s36 +; GFX6-NEXT: s_bfe_u32 s36, s5, 0x10002 +; GFX6-NEXT: v_mov_b32_e32 v2, s35 +; GFX6-NEXT: s_bfe_u32 s35, s5, 0x10004 +; GFX6-NEXT: s_bfe_u32 s5, s5, 0x10006 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s56 -; GFX6-NEXT: v_mov_b32_e32 v2, s37 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s34 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s55 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v0, s35 +; GFX6-NEXT: v_mov_b32_e32 v2, s33 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s54 -; GFX6-NEXT: v_mov_b32_e32 v2, s35 +; GFX6-NEXT: v_mov_b32_e32 v0, s36 +; GFX6-NEXT: v_mov_b32_e32 v2, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s53 -; GFX6-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s30 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s52 -; GFX6-NEXT: v_mov_b32_e32 v2, s31 +; GFX6-NEXT: v_mov_b32_e32 v0, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s51 -; GFX6-NEXT: v_mov_b32_e32 v2, s29 +; GFX6-NEXT: v_mov_b32_e32 v0, s38 +; GFX6-NEXT: v_mov_b32_e32 v2, s28 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s50 +; GFX6-NEXT: v_mov_b32_e32 v0, s39 ; GFX6-NEXT: v_mov_b32_e32 v2, s27 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s33 -; GFX6-NEXT: v_mov_b32_e32 v2, s25 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s30 -; GFX6-NEXT: v_mov_b32_e32 v2, s23 +; GFX6-NEXT: v_mov_b32_e32 v0, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s24 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s28 -; GFX6-NEXT: v_mov_b32_e32 v2, s21 +; GFX6-NEXT: v_mov_b32_e32 v0, s42 +; GFX6-NEXT: v_mov_b32_e32 v2, s22 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s26 -; GFX6-NEXT: v_mov_b32_e32 v2, s19 +; GFX6-NEXT: v_mov_b32_e32 v0, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s21 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s24 -; GFX6-NEXT: v_mov_b32_e32 v2, s17 +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v2, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s22 -; GFX6-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s25 +; GFX6-NEXT: v_mov_b32_e32 v2, s17 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s23 +; GFX6-NEXT: v_mov_b32_e32 v2, s15 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: v_mov_b32_e32 v2, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v2, s13 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v2, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -9842,752 +9841,867 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s42, s5, 30 -; GFX6-NEXT: s_lshr_b32 s36, s4, 30 -; GFX6-NEXT: s_lshr_b32 s38, s4, 31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 28 -; GFX6-NEXT: s_lshr_b32 s34, s4, 29 -; GFX6-NEXT: s_lshr_b32 s26, s4, 26 -; GFX6-NEXT: s_lshr_b32 s28, s4, 27 -; GFX6-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NEXT: s_lshr_b32 s24, s4, 25 -; GFX6-NEXT: s_lshr_b32 s18, s4, 22 -; GFX6-NEXT: s_lshr_b32 s20, s4, 23 -; GFX6-NEXT: s_lshr_b32 s14, s4, 20 -; GFX6-NEXT: s_lshr_b32 s16, s4, 21 -; GFX6-NEXT: s_lshr_b32 s10, s4, 18 -; GFX6-NEXT: s_lshr_b32 s12, s4, 19 -; GFX6-NEXT: s_lshr_b32 s6, s4, 16 -; GFX6-NEXT: s_lshr_b32 s8, s4, 17 -; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v4, s7 -; GFX6-NEXT: s_lshr_b32 s40, s4, 14 -; GFX6-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: s_mov_b32 s44, s5 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v6, s44 -; GFX6-NEXT: v_mov_b32_e32 v7, s45 -; GFX6-NEXT: s_lshr_b32 s44, s4, 15 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 12 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NEXT: s_lshr_b32 s76, s5, 30 +; GFX6-NEXT: s_lshr_b32 s74, s4, 30 +; GFX6-NEXT: s_lshr_b32 s72, s4, 31 +; GFX6-NEXT: s_lshr_b32 s70, s4, 28 +; GFX6-NEXT: s_lshr_b32 s68, s4, 29 +; GFX6-NEXT: s_lshr_b32 s66, s4, 26 +; GFX6-NEXT: s_lshr_b32 s64, s4, 27 +; GFX6-NEXT: s_lshr_b32 s62, s4, 24 +; GFX6-NEXT: s_lshr_b32 s60, s4, 25 +; GFX6-NEXT: s_lshr_b32 s58, s4, 22 +; GFX6-NEXT: s_lshr_b32 s56, s4, 23 +; GFX6-NEXT: s_lshr_b32 s54, s4, 20 +; GFX6-NEXT: s_lshr_b32 s52, s4, 21 +; GFX6-NEXT: s_lshr_b32 s50, s4, 18 +; GFX6-NEXT: s_lshr_b32 s48, s4, 19 +; GFX6-NEXT: s_lshr_b32 s46, s4, 16 +; GFX6-NEXT: s_lshr_b32 s44, s4, 17 +; GFX6-NEXT: s_lshr_b32 s42, s4, 14 +; GFX6-NEXT: s_lshr_b32 s40, s4, 15 +; GFX6-NEXT: s_lshr_b32 s38, s4, 12 ; GFX6-NEXT: s_lshr_b32 s36, s4, 13 -; GFX6-NEXT: v_mov_b32_e32 v10, s38 -; GFX6-NEXT: v_mov_b32_e32 v11, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 10 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v12, s30 -; GFX6-NEXT: v_mov_b32_e32 v13, s31 +; GFX6-NEXT: s_lshr_b32 s34, s4, 10 ; GFX6-NEXT: s_lshr_b32 s30, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v14, s34 -; GFX6-NEXT: v_mov_b32_e32 v15, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 8 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_lshr_b32 s28, s4, 8 +; GFX6-NEXT: s_lshr_b32 s26, s4, 9 +; GFX6-NEXT: s_lshr_b32 s24, s4, 6 +; GFX6-NEXT: s_lshr_b32 s22, s4, 7 +; GFX6-NEXT: s_lshr_b32 s20, s4, 4 +; GFX6-NEXT: s_lshr_b32 s18, s4, 5 +; GFX6-NEXT: s_lshr_b32 s16, s4, 2 +; GFX6-NEXT: s_lshr_b32 s14, s4, 3 +; GFX6-NEXT: s_lshr_b32 s10, s4, 1 +; GFX6-NEXT: s_lshr_b32 s6, s5, 29 +; GFX6-NEXT: s_lshr_b32 s12, s5, 28 +; GFX6-NEXT: s_lshr_b32 s8, s5, 26 +; GFX6-NEXT: s_bfe_i64 s[78:79], s[4:5], 0x10000 +; GFX6-NEXT: s_ashr_i32 s7, s5, 31 +; GFX6-NEXT: s_lshr_b32 s4, s5, 27 +; GFX6-NEXT: v_mov_b32_e32 v4, s7 +; GFX6-NEXT: s_mov_b32 s80, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s78 +; GFX6-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v1, s79 +; GFX6-NEXT: s_lshr_b32 s78, s5, 25 +; GFX6-NEXT: v_mov_b32_e32 v6, s80 +; GFX6-NEXT: s_bfe_i64 s[82:83], s[76:77], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v7, s81 +; GFX6-NEXT: s_lshr_b32 s76, s5, 24 +; GFX6-NEXT: v_mov_b32_e32 v2, s82 +; GFX6-NEXT: s_bfe_i64 s[80:81], s[74:75], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v3, s83 +; GFX6-NEXT: s_lshr_b32 s74, s5, 22 +; GFX6-NEXT: v_mov_b32_e32 v8, s80 +; GFX6-NEXT: s_bfe_i64 s[82:83], s[72:73], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v9, s81 +; GFX6-NEXT: s_lshr_b32 s72, s5, 23 +; GFX6-NEXT: v_mov_b32_e32 v10, s82 +; GFX6-NEXT: s_bfe_i64 s[80:81], s[70:71], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v11, s83 +; GFX6-NEXT: s_lshr_b32 s70, s5, 20 +; GFX6-NEXT: v_mov_b32_e32 v12, s80 +; GFX6-NEXT: s_bfe_i64 s[82:83], s[68:69], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v13, s81 +; GFX6-NEXT: s_lshr_b32 s68, s5, 21 +; GFX6-NEXT: v_mov_b32_e32 v14, s82 +; GFX6-NEXT: s_bfe_i64 s[80:81], s[66:67], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v15, s83 +; GFX6-NEXT: s_lshr_b32 s66, s5, 18 +; GFX6-NEXT: v_mov_b32_e32 v16, s80 +; GFX6-NEXT: s_bfe_i64 s[82:83], s[64:65], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v17, s81 +; GFX6-NEXT: s_lshr_b32 s64, s5, 19 +; GFX6-NEXT: v_mov_b32_e32 v18, s82 +; GFX6-NEXT: s_lshr_b32 s80, s5, 17 +; GFX6-NEXT: v_mov_b32_e32 v19, s83 +; GFX6-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v5, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 9 -; GFX6-NEXT: v_mov_b32_e32 v4, s28 -; GFX6-NEXT: v_mov_b32_e32 v5, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s62 +; GFX6-NEXT: s_bfe_i64 s[82:83], s[60:61], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v3, s63 +; GFX6-NEXT: s_lshr_b32 s60, s5, 16 +; GFX6-NEXT: v_mov_b32_e32 v4, s82 +; GFX6-NEXT: s_lshr_b32 s62, s5, 14 +; GFX6-NEXT: v_mov_b32_e32 v5, s83 +; GFX6-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s22 -; GFX6-NEXT: v_mov_b32_e32 v9, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 7 -; GFX6-NEXT: v_mov_b32_e32 v10, s24 -; GFX6-NEXT: v_mov_b32_e32 v11, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 4 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s58 +; GFX6-NEXT: s_bfe_i64 s[82:83], s[56:57], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v9, s59 +; GFX6-NEXT: s_lshr_b32 s56, s5, 15 +; GFX6-NEXT: v_mov_b32_e32 v10, s82 +; GFX6-NEXT: s_lshr_b32 s58, s5, 12 +; GFX6-NEXT: v_mov_b32_e32 v11, s83 +; GFX6-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s18 -; GFX6-NEXT: v_mov_b32_e32 v13, s19 -; GFX6-NEXT: s_lshr_b32 s18, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v14, s20 -; GFX6-NEXT: v_mov_b32_e32 v15, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:208 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 3 -; GFX6-NEXT: v_mov_b32_e32 v4, s16 -; GFX6-NEXT: v_mov_b32_e32 v5, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 1 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GFX6-NEXT: v_mov_b32_e32 v12, s54 +; GFX6-NEXT: s_bfe_i64 s[82:83], s[52:53], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v13, s55 +; GFX6-NEXT: s_lshr_b32 s52, s5, 13 +; GFX6-NEXT: v_mov_b32_e32 v14, s82 +; GFX6-NEXT: s_lshr_b32 s54, s5, 10 +; GFX6-NEXT: v_mov_b32_e32 v15, s83 +; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s10 -; GFX6-NEXT: v_mov_b32_e32 v9, s11 -; GFX6-NEXT: s_lshr_b32 s10, s5, 29 -; GFX6-NEXT: v_mov_b32_e32 v10, s12 -; GFX6-NEXT: v_mov_b32_e32 v11, s13 -; GFX6-NEXT: s_lshr_b32 s12, s5, 28 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176 +; GFX6-NEXT: v_mov_b32_e32 v16, s50 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v17, s51 +; GFX6-NEXT: s_lshr_b32 s50, s5, 11 +; GFX6-NEXT: v_mov_b32_e32 v18, s48 +; GFX6-NEXT: s_lshr_b32 s48, s5, 8 +; GFX6-NEXT: v_mov_b32_e32 v19, s49 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NEXT: s_lshr_b32 s6, s5, 26 -; GFX6-NEXT: v_mov_b32_e32 v14, s8 -; GFX6-NEXT: v_mov_b32_e32 v15, s9 -; GFX6-NEXT: s_lshr_b32 s8, s5, 27 +; GFX6-NEXT: v_mov_b32_e32 v2, s46 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: s_lshr_b32 s40, s5, 25 +; GFX6-NEXT: v_mov_b32_e32 v3, s47 +; GFX6-NEXT: s_lshr_b32 s46, s5, 9 ; GFX6-NEXT: v_mov_b32_e32 v4, s44 +; GFX6-NEXT: s_lshr_b32 s44, s5, 6 ; GFX6-NEXT: v_mov_b32_e32 v5, s45 -; GFX6-NEXT: s_lshr_b32 s44, s5, 24 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v8, s42 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v9, s43 -; GFX6-NEXT: s_lshr_b32 s42, s5, 22 -; GFX6-NEXT: v_mov_b32_e32 v10, s36 -; GFX6-NEXT: v_mov_b32_e32 v11, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 23 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: s_lshr_b32 s42, s5, 7 +; GFX6-NEXT: v_mov_b32_e32 v10, s40 +; GFX6-NEXT: s_lshr_b32 s40, s5, 4 +; GFX6-NEXT: v_mov_b32_e32 v11, s41 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v12, s38 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v13, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 20 -; GFX6-NEXT: v_mov_b32_e32 v14, s30 -; GFX6-NEXT: v_mov_b32_e32 v15, s31 -; GFX6-NEXT: s_lshr_b32 s4, s5, 21 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[34:35], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 +; GFX6-NEXT: s_lshr_b32 s38, s5, 5 +; GFX6-NEXT: v_mov_b32_e32 v14, s36 +; GFX6-NEXT: s_lshr_b32 s36, s5, 2 +; GFX6-NEXT: v_mov_b32_e32 v15, s37 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 -; GFX6-NEXT: s_lshr_b32 s30, s5, 18 -; GFX6-NEXT: v_mov_b32_e32 v4, s26 -; GFX6-NEXT: v_mov_b32_e32 v5, s27 -; GFX6-NEXT: s_lshr_b32 s26, s5, 19 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v16, s34 +; GFX6-NEXT: s_lshr_b32 s34, s5, 3 +; GFX6-NEXT: v_mov_b32_e32 v17, s35 +; GFX6-NEXT: s_bfe_i64 s[82:83], s[30:31], 0x10000 +; GFX6-NEXT: s_lshr_b32 s30, s5, 1 +; GFX6-NEXT: v_mov_b32_e32 v18, s82 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v19, s83 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s28 -; GFX6-NEXT: v_mov_b32_e32 v9, s29 -; GFX6-NEXT: s_lshr_b32 s28, s5, 17 -; GFX6-NEXT: v_mov_b32_e32 v10, s22 -; GFX6-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NEXT: s_lshr_b32 s22, s5, 16 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NEXT: s_bfe_i64 s[82:83], s[26:27], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[34:35], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v4, s82 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[36:37], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v5, s83 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s24 -; GFX6-NEXT: v_mov_b32_e32 v13, s25 -; GFX6-NEXT: s_lshr_b32 s24, s5, 14 -; GFX6-NEXT: v_mov_b32_e32 v14, s18 -; GFX6-NEXT: v_mov_b32_e32 v15, s19 -; GFX6-NEXT: s_lshr_b32 s18, s5, 15 +; GFX6-NEXT: v_mov_b32_e32 v8, s24 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[22:23], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v9, s25 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[38:39], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v10, s34 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[40:41], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v11, s35 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 -; GFX6-NEXT: v_mov_b32_e32 v16, s20 -; GFX6-NEXT: v_mov_b32_e32 v17, s21 -; GFX6-NEXT: s_lshr_b32 s20, s5, 12 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v12, s20 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[18:19], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v13, s21 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s34 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[44:45], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v15, s35 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v16, s16 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v17, s17 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[46:47], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v18, s14 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[48:49], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v19, s15 -; GFX6-NEXT: s_lshr_b32 s14, s5, 13 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: s_lshr_b32 s16, s5, 10 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NEXT: s_bfe_i64 s[10:11], s[50:51], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v20, s12 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[54:55], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v21, s13 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s12 -; GFX6-NEXT: v_mov_b32_e32 v9, s13 -; GFX6-NEXT: s_lshr_b32 s12, s5, 11 -; GFX6-NEXT: v_mov_b32_e32 v10, s10 -; GFX6-NEXT: v_mov_b32_e32 v11, s11 -; GFX6-NEXT: s_lshr_b32 s10, s5, 8 +; GFX6-NEXT: v_mov_b32_e32 v22, s6 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v23, s7 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[52:53], 0x10000 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v8, s8 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v9, s9 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GFX6-NEXT: v_mov_b32_e32 v10, s4 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[76:77], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v11, s5 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[56:57], 0x10000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NEXT: s_lshr_b32 s6, s5, 9 +; GFX6-NEXT: v_mov_b32_e32 v12, s8 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[62:63], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v13, s9 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[78:79], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 ; GFX6-NEXT: v_mov_b32_e32 v14, s8 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[74:75], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v15, s9 -; GFX6-NEXT: s_lshr_b32 s8, s5, 6 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[60:61], 0x10000 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v16, s34 -; GFX6-NEXT: v_mov_b32_e32 v17, s35 -; GFX6-NEXT: s_lshr_b32 s34, s5, 7 -; GFX6-NEXT: v_mov_b32_e32 v18, s40 -; GFX6-NEXT: v_mov_b32_e32 v19, s41 -; GFX6-NEXT: s_lshr_b32 s40, s5, 4 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v16, s38 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[80:81], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v17, s39 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[72:73], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v18, s38 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[70:71], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v19, s39 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[64:65], 0x10000 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s42 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[66:67], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: s_lshr_b32 s42, s5, 5 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 2 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[68:69], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:480 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:464 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:448 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:432 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NEXT: v_mov_b32_e32 v9, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 3 -; GFX6-NEXT: s_lshr_b32 s44, s5, 1 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464 -; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:448 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432 -; GFX6-NEXT: v_mov_b32_e32 v10, s4 -; GFX6-NEXT: v_mov_b32_e32 v11, s5 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:416 -; GFX6-NEXT: s_waitcnt expcnt(1) -; GFX6-NEXT: v_mov_b32_e32 v0, s30 -; GFX6-NEXT: v_mov_b32_e32 v1, s31 -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: v_mov_b32_e32 v0, s44 +; GFX6-NEXT: v_mov_b32_e32 v1, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s22 -; GFX6-NEXT: v_mov_b32_e32 v1, s23 -; GFX6-NEXT: v_mov_b32_e32 v2, s28 -; GFX6-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s24 -; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: v_mov_b32_e32 v0, s36 +; GFX6-NEXT: v_mov_b32_e32 v1, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s34 +; GFX6-NEXT: v_mov_b32_e32 v1, s35 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NEXT: v_mov_b32_e32 v0, s24 +; GFX6-NEXT: v_mov_b32_e32 v1, s25 +; GFX6-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NEXT: v_mov_b32_e32 v3, s23 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s36 -; GFX6-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: v_mov_b32_e32 v0, s28 +; GFX6-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v3, s27 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272 -; GFX6-NEXT: v_mov_b32_e32 v8, s44 -; GFX6-NEXT: v_mov_b32_e32 v9, s45 +; GFX6-NEXT: v_mov_b32_e32 v8, s30 +; GFX6-NEXT: v_mov_b32_e32 v9, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX8-NEXT: ; implicit-def: $vgpr50 : SGPR spill to VGPR lane ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[10:11], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s3, 8 -; GFX8-NEXT: s_lshr_b32 s48, s3, 15 -; GFX8-NEXT: v_writelane_b32 v62, s4, 0 -; GFX8-NEXT: s_lshr_b32 s74, s3, 30 -; GFX8-NEXT: s_lshr_b32 s30, s3, 31 -; GFX8-NEXT: s_lshr_b32 s72, s3, 28 -; GFX8-NEXT: s_lshr_b32 s34, s3, 29 -; GFX8-NEXT: s_lshr_b32 s70, s3, 26 -; GFX8-NEXT: s_lshr_b32 s36, s3, 27 -; GFX8-NEXT: s_lshr_b32 s68, s3, 24 -; GFX8-NEXT: s_lshr_b32 s38, s3, 25 -; GFX8-NEXT: s_lshr_b32 s64, s3, 22 -; GFX8-NEXT: s_lshr_b32 s40, s3, 23 -; GFX8-NEXT: s_lshr_b32 s60, s3, 20 -; GFX8-NEXT: s_lshr_b32 s42, s3, 21 -; GFX8-NEXT: s_lshr_b32 s66, s3, 18 -; GFX8-NEXT: s_lshr_b32 s44, s3, 19 -; GFX8-NEXT: s_lshr_b32 s56, s3, 16 -; GFX8-NEXT: s_lshr_b32 s46, s3, 17 -; GFX8-NEXT: s_lshr_b32 s58, s3, 14 -; GFX8-NEXT: s_lshr_b32 s62, s3, 12 -; GFX8-NEXT: s_lshr_b32 s54, s3, 10 -; GFX8-NEXT: v_writelane_b32 v62, s5, 1 -; GFX8-NEXT: s_lshr_b32 s4, s3, 9 -; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX8-NEXT: s_lshr_b32 s52, s3, 11 -; GFX8-NEXT: v_writelane_b32 v62, s4, 2 -; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX8-NEXT: v_mov_b32_e32 v34, s48 -; GFX8-NEXT: s_lshr_b32 s48, s2, 1 -; GFX8-NEXT: s_lshr_b32 s50, s3, 13 -; GFX8-NEXT: v_writelane_b32 v62, s5, 3 -; GFX8-NEXT: s_lshr_b32 s8, s3, 6 -; GFX8-NEXT: s_lshr_b32 s10, s3, 7 -; GFX8-NEXT: s_lshr_b32 s12, s3, 4 -; GFX8-NEXT: s_lshr_b32 s14, s3, 5 -; GFX8-NEXT: s_lshr_b32 s16, s3, 2 -; GFX8-NEXT: s_lshr_b32 s18, s3, 3 -; GFX8-NEXT: s_lshr_b32 s20, s3, 1 -; GFX8-NEXT: s_mov_b32 s22, s3 -; GFX8-NEXT: s_lshr_b32 s24, s2, 30 -; GFX8-NEXT: s_lshr_b32 s26, s2, 31 -; GFX8-NEXT: s_lshr_b32 s28, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v4, s74 -; GFX8-NEXT: v_mov_b32_e32 v12, s72 -; GFX8-NEXT: v_mov_b32_e32 v0, s70 -; GFX8-NEXT: v_mov_b32_e32 v8, s68 -; GFX8-NEXT: v_mov_b32_e32 v16, s64 -; GFX8-NEXT: v_mov_b32_e32 v20, s60 -; GFX8-NEXT: v_mov_b32_e32 v24, s66 -; GFX8-NEXT: v_mov_b32_e32 v28, s56 -; GFX8-NEXT: v_mov_b32_e32 v32, s58 -; GFX8-NEXT: v_mov_b32_e32 v36, s62 -; GFX8-NEXT: s_lshr_b32 s86, s2, 29 -; GFX8-NEXT: v_mov_b32_e32 v40, s54 -; GFX8-NEXT: s_lshr_b32 s84, s2, 26 -; GFX8-NEXT: s_lshr_b32 s82, s2, 27 -; GFX8-NEXT: s_bfe_i64 vcc, s[52:53], 0x10000 -; GFX8-NEXT: s_lshr_b32 s80, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v6, s30 -; GFX8-NEXT: v_mov_b32_e32 v7, s31 -; GFX8-NEXT: s_lshr_b32 s78, s2, 25 -; GFX8-NEXT: s_lshr_b32 s76, s2, 22 -; GFX8-NEXT: v_mov_b32_e32 v14, s34 -; GFX8-NEXT: s_lshr_b32 s74, s2, 23 -; GFX8-NEXT: s_lshr_b32 s72, s2, 20 -; GFX8-NEXT: v_mov_b32_e32 v2, s36 -; GFX8-NEXT: s_lshr_b32 s70, s2, 21 -; GFX8-NEXT: s_lshr_b32 s68, s2, 18 -; GFX8-NEXT: v_mov_b32_e32 v10, s38 -; GFX8-NEXT: s_lshr_b32 s66, s2, 19 -; GFX8-NEXT: s_lshr_b32 s64, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v18, s40 -; GFX8-NEXT: s_lshr_b32 s62, s2, 17 -; GFX8-NEXT: s_lshr_b32 s60, s2, 14 -; GFX8-NEXT: v_mov_b32_e32 v22, s42 -; GFX8-NEXT: s_lshr_b32 s58, s2, 15 -; GFX8-NEXT: s_lshr_b32 s56, s2, 12 -; GFX8-NEXT: v_mov_b32_e32 v26, s44 -; GFX8-NEXT: s_lshr_b32 s54, s2, 13 -; GFX8-NEXT: s_lshr_b32 s52, s2, 10 -; GFX8-NEXT: v_mov_b32_e32 v30, s46 -; GFX8-NEXT: s_lshr_b32 s6, s2, 11 -; GFX8-NEXT: s_lshr_b32 s4, s2, 8 -; GFX8-NEXT: s_lshr_b32 s46, s2, 9 -; GFX8-NEXT: s_lshr_b32 s44, s2, 6 -; GFX8-NEXT: s_lshr_b32 s42, s2, 7 -; GFX8-NEXT: s_lshr_b32 s40, s2, 4 -; GFX8-NEXT: s_lshr_b32 s38, s2, 5 -; GFX8-NEXT: s_lshr_b32 s36, s2, 2 -; GFX8-NEXT: s_lshr_b32 s34, s2, 3 -; GFX8-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[2:3], s[48:49], 0x10000 -; GFX8-NEXT: v_writelane_b32 v62, s2, 4 -; GFX8-NEXT: v_writelane_b32 v62, s3, 5 -; GFX8-NEXT: v_readlane_b32 s2, v62, 2 -; GFX8-NEXT: v_readlane_b32 s3, v62, 3 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX8-NEXT: v_mov_b32_e32 v35, s49 -; GFX8-NEXT: s_bfe_i64 s[48:49], s[4:5], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 -; GFX8-NEXT: v_readlane_b32 s2, v62, 0 -; GFX8-NEXT: v_readlane_b32 s3, v62, 1 -; GFX8-NEXT: v_mov_b32_e32 v5, s75 -; GFX8-NEXT: v_mov_b32_e32 v13, s73 -; GFX8-NEXT: v_mov_b32_e32 v15, s35 -; GFX8-NEXT: v_mov_b32_e32 v1, s71 -; GFX8-NEXT: v_mov_b32_e32 v3, s37 -; GFX8-NEXT: v_mov_b32_e32 v9, s69 -; GFX8-NEXT: v_mov_b32_e32 v11, s39 -; GFX8-NEXT: v_mov_b32_e32 v17, s65 -; GFX8-NEXT: v_mov_b32_e32 v19, s41 -; GFX8-NEXT: v_mov_b32_e32 v21, s61 -; GFX8-NEXT: v_mov_b32_e32 v23, s43 -; GFX8-NEXT: v_mov_b32_e32 v25, s67 -; GFX8-NEXT: v_mov_b32_e32 v27, s45 -; GFX8-NEXT: v_mov_b32_e32 v29, s57 -; GFX8-NEXT: v_mov_b32_e32 v31, s47 -; GFX8-NEXT: v_mov_b32_e32 v33, s59 -; GFX8-NEXT: v_mov_b32_e32 v37, s63 -; GFX8-NEXT: v_mov_b32_e32 v38, s50 -; GFX8-NEXT: v_mov_b32_e32 v39, s51 -; GFX8-NEXT: v_mov_b32_e32 v41, s55 -; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[6:7], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX8-NEXT: s_lshr_b32 s2, s1, 18 +; GFX8-NEXT: v_writelane_b32 v50, s2, 0 +; GFX8-NEXT: v_writelane_b32 v50, s3, 1 +; GFX8-NEXT: s_lshr_b32 s2, s1, 19 +; GFX8-NEXT: v_writelane_b32 v50, s2, 2 +; GFX8-NEXT: v_writelane_b32 v50, s3, 3 +; GFX8-NEXT: s_lshr_b32 s2, s1, 16 +; GFX8-NEXT: v_writelane_b32 v50, s2, 4 +; GFX8-NEXT: v_writelane_b32 v50, s3, 5 +; GFX8-NEXT: s_lshr_b32 s2, s1, 17 +; GFX8-NEXT: v_writelane_b32 v50, s2, 6 +; GFX8-NEXT: v_writelane_b32 v50, s3, 7 +; GFX8-NEXT: s_lshr_b32 s2, s1, 14 +; GFX8-NEXT: v_writelane_b32 v50, s2, 8 +; GFX8-NEXT: v_writelane_b32 v50, s3, 9 +; GFX8-NEXT: s_lshr_b32 s2, s1, 15 +; GFX8-NEXT: v_writelane_b32 v50, s2, 10 +; GFX8-NEXT: v_writelane_b32 v50, s3, 11 +; GFX8-NEXT: s_lshr_b32 s2, s1, 12 +; GFX8-NEXT: v_writelane_b32 v50, s2, 12 +; GFX8-NEXT: v_writelane_b32 v50, s3, 13 +; GFX8-NEXT: s_lshr_b32 s2, s1, 13 +; GFX8-NEXT: v_writelane_b32 v50, s2, 14 +; GFX8-NEXT: v_writelane_b32 v50, s3, 15 +; GFX8-NEXT: s_lshr_b32 s2, s1, 10 +; GFX8-NEXT: v_writelane_b32 v50, s2, 16 +; GFX8-NEXT: v_writelane_b32 v50, s3, 17 +; GFX8-NEXT: s_lshr_b32 s2, s1, 11 +; GFX8-NEXT: v_writelane_b32 v50, s2, 18 +; GFX8-NEXT: v_writelane_b32 v50, s3, 19 +; GFX8-NEXT: s_lshr_b32 s2, s1, 8 +; GFX8-NEXT: v_writelane_b32 v50, s2, 20 +; GFX8-NEXT: v_writelane_b32 v50, s3, 21 +; GFX8-NEXT: s_lshr_b32 s2, s1, 9 +; GFX8-NEXT: v_writelane_b32 v50, s2, 22 +; GFX8-NEXT: v_writelane_b32 v50, s3, 23 +; GFX8-NEXT: s_lshr_b32 s2, s1, 6 +; GFX8-NEXT: v_writelane_b32 v50, s2, 24 +; GFX8-NEXT: v_writelane_b32 v50, s3, 25 +; GFX8-NEXT: s_lshr_b32 s2, s1, 7 +; GFX8-NEXT: v_writelane_b32 v50, s2, 26 +; GFX8-NEXT: v_writelane_b32 v50, s3, 27 +; GFX8-NEXT: s_lshr_b32 s2, s1, 4 +; GFX8-NEXT: v_writelane_b32 v50, s2, 28 +; GFX8-NEXT: v_writelane_b32 v50, s3, 29 +; GFX8-NEXT: s_lshr_b32 s2, s1, 5 +; GFX8-NEXT: v_writelane_b32 v50, s2, 30 +; GFX8-NEXT: v_writelane_b32 v50, s3, 31 +; GFX8-NEXT: s_lshr_b32 s2, s1, 2 +; GFX8-NEXT: v_writelane_b32 v50, s2, 32 +; GFX8-NEXT: v_writelane_b32 v50, s3, 33 +; GFX8-NEXT: s_lshr_b32 s2, s1, 3 +; GFX8-NEXT: v_writelane_b32 v50, s2, 34 +; GFX8-NEXT: v_writelane_b32 v50, s3, 35 +; GFX8-NEXT: s_lshr_b32 s2, s1, 1 +; GFX8-NEXT: v_writelane_b32 v50, s2, 36 +; GFX8-NEXT: v_writelane_b32 v50, s3, 37 +; GFX8-NEXT: s_mov_b32 s2, s1 +; GFX8-NEXT: v_writelane_b32 v50, s2, 38 +; GFX8-NEXT: v_writelane_b32 v50, s3, 39 +; GFX8-NEXT: s_lshr_b32 s2, s0, 30 +; GFX8-NEXT: v_writelane_b32 v50, s2, 40 +; GFX8-NEXT: s_lshr_b32 s38, s1, 30 +; GFX8-NEXT: s_lshr_b32 s40, s1, 31 +; GFX8-NEXT: s_lshr_b32 s42, s1, 28 +; GFX8-NEXT: s_lshr_b32 s48, s1, 29 +; GFX8-NEXT: s_lshr_b32 s50, s1, 26 +; GFX8-NEXT: s_lshr_b32 s54, s1, 27 +; GFX8-NEXT: s_lshr_b32 s60, s1, 24 +; GFX8-NEXT: s_lshr_b32 s66, s1, 25 +; GFX8-NEXT: s_lshr_b32 s70, s1, 22 +; GFX8-NEXT: s_lshr_b32 s76, s1, 23 +; GFX8-NEXT: s_lshr_b32 s80, s1, 20 +; GFX8-NEXT: s_lshr_b32 vcc_lo, s1, 21 +; GFX8-NEXT: v_writelane_b32 v50, s3, 41 +; GFX8-NEXT: s_lshr_b32 s86, s0, 31 +; GFX8-NEXT: s_lshr_b32 s84, s0, 28 +; GFX8-NEXT: s_lshr_b32 s82, s0, 29 +; GFX8-NEXT: s_lshr_b32 s78, s0, 26 +; GFX8-NEXT: s_lshr_b32 s74, s0, 27 +; GFX8-NEXT: s_lshr_b32 s72, s0, 24 +; GFX8-NEXT: s_lshr_b32 s68, s0, 25 +; GFX8-NEXT: s_lshr_b32 s64, s0, 22 +; GFX8-NEXT: s_lshr_b32 s62, s0, 23 +; GFX8-NEXT: s_lshr_b32 s58, s0, 20 +; GFX8-NEXT: s_lshr_b32 s56, s0, 21 +; GFX8-NEXT: s_lshr_b32 s52, s0, 18 +; GFX8-NEXT: s_lshr_b32 s46, s0, 19 +; GFX8-NEXT: s_lshr_b32 s44, s0, 16 +; GFX8-NEXT: s_lshr_b32 s36, s0, 17 +; GFX8-NEXT: s_lshr_b32 s34, s0, 14 +; GFX8-NEXT: s_lshr_b32 s30, s0, 15 +; GFX8-NEXT: s_lshr_b32 s28, s0, 12 +; GFX8-NEXT: s_lshr_b32 s26, s0, 13 +; GFX8-NEXT: s_lshr_b32 s24, s0, 10 +; GFX8-NEXT: s_lshr_b32 s22, s0, 11 +; GFX8-NEXT: s_lshr_b32 s20, s0, 8 +; GFX8-NEXT: s_lshr_b32 s18, s0, 9 +; GFX8-NEXT: s_lshr_b32 s16, s0, 6 +; GFX8-NEXT: s_lshr_b32 s14, s0, 7 +; GFX8-NEXT: s_lshr_b32 s10, s0, 4 +; GFX8-NEXT: s_lshr_b32 s12, s0, 5 +; GFX8-NEXT: s_lshr_b32 s6, s0, 2 +; GFX8-NEXT: s_lshr_b32 s4, s0, 3 +; GFX8-NEXT: s_lshr_b32 s2, s0, 1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s0, 42 +; GFX8-NEXT: v_writelane_b32 v50, s1, 43 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s0, 44 +; GFX8-NEXT: v_writelane_b32 v50, s1, 45 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[4:5], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s0, 46 +; GFX8-NEXT: v_writelane_b32 v50, s1, 47 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[6:7], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s0, 48 +; GFX8-NEXT: v_writelane_b32 v50, s1, 49 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s0, 50 +; GFX8-NEXT: v_writelane_b32 v50, s1, 51 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s0, 52 +; GFX8-NEXT: v_writelane_b32 v50, s1, 53 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[14:15], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s0, 54 +; GFX8-NEXT: v_writelane_b32 v50, s1, 55 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[16:17], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s0, 56 +; GFX8-NEXT: v_writelane_b32 v50, s1, 57 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[18:19], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s0, 58 +; GFX8-NEXT: v_writelane_b32 v50, s1, 59 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[38:39], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v24, s0 +; GFX8-NEXT: v_mov_b32_e32 v25, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[40:41], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v26, s0 +; GFX8-NEXT: v_mov_b32_e32 v27, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[42:43], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v20, s0 +; GFX8-NEXT: v_mov_b32_e32 v21, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[48:49], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v22, s0 +; GFX8-NEXT: v_mov_b32_e32 v23, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[50:51], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[54:55], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[60:61], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[2:3], s[20:21], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s2, 60 +; GFX8-NEXT: v_mov_b32_e32 v6, s0 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[70:71], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s3, 61 +; GFX8-NEXT: s_bfe_i64 s[2:3], s[28:29], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[76:77], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s2, 62 +; GFX8-NEXT: v_mov_b32_e32 v14, s0 +; GFX8-NEXT: v_mov_b32_e32 v15, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[80:81], 0x10000 +; GFX8-NEXT: v_writelane_b32 v50, s3, 63 +; GFX8-NEXT: v_mov_b32_e32 v28, s0 +; GFX8-NEXT: v_mov_b32_e32 v29, s1 +; GFX8-NEXT: s_bfe_i64 s[0:1], vcc, 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v30, s0 +; GFX8-NEXT: v_mov_b32_e32 v31, s1 +; GFX8-NEXT: v_readlane_b32 s0, v50, 0 +; GFX8-NEXT: v_readlane_b32 s1, v50, 1 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 +; GFX8-NEXT: v_readlane_b32 s0, v50, 2 +; GFX8-NEXT: v_readlane_b32 s1, v50, 3 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_readlane_b32 s0, v50, 4 +; GFX8-NEXT: v_readlane_b32 s1, v50, 5 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 +; GFX8-NEXT: v_readlane_b32 s0, v50, 6 +; GFX8-NEXT: v_readlane_b32 s1, v50, 7 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v18, s0 +; GFX8-NEXT: v_mov_b32_e32 v19, s1 +; GFX8-NEXT: v_readlane_b32 s0, v50, 8 +; GFX8-NEXT: v_readlane_b32 s1, v50, 9 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v32, s0 +; GFX8-NEXT: v_mov_b32_e32 v33, s1 +; GFX8-NEXT: v_readlane_b32 s0, v50, 10 +; GFX8-NEXT: v_readlane_b32 s1, v50, 11 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[0:1], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v34, s4 +; GFX8-NEXT: v_mov_b32_e32 v35, s5 +; GFX8-NEXT: v_readlane_b32 s4, v50, 12 +; GFX8-NEXT: v_readlane_b32 s5, v50, 13 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v36, s4 +; GFX8-NEXT: v_mov_b32_e32 v37, s5 +; GFX8-NEXT: v_readlane_b32 s4, v50, 14 +; GFX8-NEXT: v_readlane_b32 s5, v50, 15 +; GFX8-NEXT: s_bfe_i64 s[10:11], s[4:5], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v38, s10 +; GFX8-NEXT: v_mov_b32_e32 v39, s11 +; GFX8-NEXT: v_readlane_b32 s10, v50, 16 +; GFX8-NEXT: v_readlane_b32 s11, v50, 17 +; GFX8-NEXT: s_bfe_i64 s[12:13], s[10:11], 0x10000 +; GFX8-NEXT: v_readlane_b32 s2, v50, 40 +; GFX8-NEXT: v_readlane_b32 s3, v50, 41 +; GFX8-NEXT: v_mov_b32_e32 v40, s12 +; GFX8-NEXT: v_mov_b32_e32 v41, s13 +; GFX8-NEXT: v_readlane_b32 s12, v50, 18 +; GFX8-NEXT: v_readlane_b32 s13, v50, 19 +; GFX8-NEXT: s_bfe_i64 s[80:81], s[2:3], 0x10000 +; GFX8-NEXT: v_readlane_b32 s2, v50, 38 +; GFX8-NEXT: v_readlane_b32 s3, v50, 39 +; GFX8-NEXT: s_bfe_i64 s[14:15], s[12:13], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[28:29], s[34:35], 0x10000 +; GFX8-NEXT: s_bfe_i64 vcc, s[2:3], 0x10000 +; GFX8-NEXT: v_readlane_b32 s2, v50, 36 +; GFX8-NEXT: v_readlane_b32 s3, v50, 37 +; GFX8-NEXT: v_readlane_b32 s0, v50, 34 +; GFX8-NEXT: v_readlane_b32 s1, v50, 35 +; GFX8-NEXT: v_readlane_b32 s6, v50, 32 +; GFX8-NEXT: v_readlane_b32 s7, v50, 33 +; GFX8-NEXT: v_readlane_b32 s4, v50, 30 +; GFX8-NEXT: v_readlane_b32 s5, v50, 31 +; GFX8-NEXT: v_readlane_b32 s10, v50, 28 +; GFX8-NEXT: v_readlane_b32 s11, v50, 29 +; GFX8-NEXT: v_mov_b32_e32 v42, s14 +; GFX8-NEXT: v_readlane_b32 s12, v50, 26 +; GFX8-NEXT: v_readlane_b32 s13, v50, 27 +; GFX8-NEXT: v_mov_b32_e32 v43, s15 +; GFX8-NEXT: v_readlane_b32 s14, v50, 24 +; GFX8-NEXT: v_readlane_b32 s15, v50, 25 +; GFX8-NEXT: v_readlane_b32 s16, v50, 22 +; GFX8-NEXT: v_readlane_b32 s17, v50, 23 +; GFX8-NEXT: v_readlane_b32 s34, v50, 20 +; GFX8-NEXT: v_readlane_b32 s35, v50, 21 +; GFX8-NEXT: s_bfe_i64 s[40:41], s[22:23], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[42:43], s[24:25], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[48:49], s[26:27], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[54:55], s[30:31], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[30:31], s[36:37], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[20:21], s[46:47], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[24:25], s[52:53], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[44:45], s[56:57], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[46:47], s[58:59], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[18:19], s[62:63], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[64:65], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[52:53], s[68:69], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[22:23], s[72:73], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[56:57], s[74:75], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[58:59], s[78:79], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[60:61], s[82:83], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[62:63], s[84:85], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[70:71], s[86:87], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 -; GFX8-NEXT: s_add_u32 s2, s0, 0x1f0 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v43, s3 -; GFX8-NEXT: v_mov_b32_e32 v42, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v45, s3 -; GFX8-NEXT: v_mov_b32_e32 v44, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v47, s3 -; GFX8-NEXT: v_mov_b32_e32 v46, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v49, s3 -; GFX8-NEXT: v_mov_b32_e32 v48, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x1b0 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v51, s3 -; GFX8-NEXT: v_mov_b32_e32 v50, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x1a0 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v53, s3 -; GFX8-NEXT: v_mov_b32_e32 v52, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x190 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v55, s3 -; GFX8-NEXT: v_mov_b32_e32 v54, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x180 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v57, s3 -; GFX8-NEXT: v_mov_b32_e32 v56, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x170 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v59, s3 -; GFX8-NEXT: v_mov_b32_e32 v58, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x160 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v61, s3 -; GFX8-NEXT: v_mov_b32_e32 v60, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x150 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v13, s3 -; GFX8-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x140 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x130 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v17, s3 -; GFX8-NEXT: v_mov_b32_e32 v16, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x120 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s3 -; GFX8-NEXT: v_mov_b32_e32 v18, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x110 -; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo -; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 +; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX8-NEXT: s_add_u32 s36, s8, 0x1f0 +; GFX8-NEXT: s_addc_u32 s37, s9, 0 +; GFX8-NEXT: s_add_u32 s38, s8, 0x1e0 +; GFX8-NEXT: s_addc_u32 s39, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v44, s34 +; GFX8-NEXT: s_add_u32 s34, s8, 0x1d0 +; GFX8-NEXT: v_mov_b32_e32 v45, s35 +; GFX8-NEXT: v_mov_b32_e32 v47, s37 +; GFX8-NEXT: s_addc_u32 s35, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v46, s36 +; GFX8-NEXT: s_add_u32 s36, s8, 0x1c0 +; GFX8-NEXT: v_mov_b32_e32 v49, s39 +; GFX8-NEXT: s_addc_u32 s37, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v48, s38 +; GFX8-NEXT: s_add_u32 s38, s8, 0x1b0 +; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[24:27] +; GFX8-NEXT: s_addc_u32 s39, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v24, s34 +; GFX8-NEXT: v_mov_b32_e32 v25, s35 +; GFX8-NEXT: s_add_u32 s34, s8, 0x1a0 +; GFX8-NEXT: v_mov_b32_e32 v26, s36 +; GFX8-NEXT: s_addc_u32 s35, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v27, s37 +; GFX8-NEXT: s_add_u32 s36, s8, 0x190 +; GFX8-NEXT: v_mov_b32_e32 v47, s39 +; GFX8-NEXT: s_addc_u32 s37, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v46, s38 +; GFX8-NEXT: s_add_u32 s38, s8, 0x180 +; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[20:23] +; GFX8-NEXT: s_addc_u32 s39, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v22, s34 +; GFX8-NEXT: v_mov_b32_e32 v23, s35 +; GFX8-NEXT: s_add_u32 s34, s8, 0x170 +; GFX8-NEXT: s_addc_u32 s35, s9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[28:31] +; GFX8-NEXT: v_mov_b32_e32 v46, s16 +; GFX8-NEXT: s_add_u32 s16, s8, 0x160 +; GFX8-NEXT: v_mov_b32_e32 v47, s17 +; GFX8-NEXT: s_addc_u32 s17, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s16 +; GFX8-NEXT: v_mov_b32_e32 v7, s17 +; GFX8-NEXT: s_add_u32 s16, s8, 0x150 +; GFX8-NEXT: s_addc_u32 s17, s9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v20, s36 +; GFX8-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NEXT: s_add_u32 s14, s8, 0x140 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: s_addc_u32 s15, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v21, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: s_add_u32 s12, s8, 0x130 +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s16 ; GFX8-NEXT: v_mov_b32_e32 v10, s14 +; GFX8-NEXT: s_addc_u32 s13, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v12, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s17 ; GFX8-NEXT: v_mov_b32_e32 v11, s15 -; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31] -; GFX8-NEXT: flat_store_dwordx4 v[58:59], v[32:35] -; GFX8-NEXT: flat_store_dwordx4 v[60:61], v[36:39] -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[40:43] -; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x100 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 -; GFX8-NEXT: v_mov_b32_e32 v0, s22 -; GFX8-NEXT: v_mov_b32_e32 v1, s23 -; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: v_mov_b32_e32 v0, s24 -; GFX8-NEXT: v_mov_b32_e32 v1, s25 -; GFX8-NEXT: v_mov_b32_e32 v2, s26 -; GFX8-NEXT: v_mov_b32_e32 v3, s27 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v0, s28 -; GFX8-NEXT: v_mov_b32_e32 v1, s29 -; GFX8-NEXT: v_mov_b32_e32 v2, s86 -; GFX8-NEXT: v_mov_b32_e32 v3, s87 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v13, s13 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[36:39] +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[40:43] +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[44:47] +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: s_add_u32 s4, s8, 0x120 +; GFX8-NEXT: v_mov_b32_e32 v7, s5 +; GFX8-NEXT: s_addc_u32 s5, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x110 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x100 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0xc0 -; GFX8-NEXT: v_mov_b32_e32 v0, s84 -; GFX8-NEXT: v_mov_b32_e32 v1, s85 -; GFX8-NEXT: v_mov_b32_e32 v2, s82 -; GFX8-NEXT: v_mov_b32_e32 v3, s83 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xf0 +; GFX8-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX8-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v0, s80 ; GFX8-NEXT: v_mov_b32_e32 v1, s81 -; GFX8-NEXT: v_mov_b32_e32 v2, s78 -; GFX8-NEXT: v_mov_b32_e32 v3, s79 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 -; GFX8-NEXT: v_mov_b32_e32 v0, s76 -; GFX8-NEXT: v_mov_b32_e32 v1, s77 -; GFX8-NEXT: v_mov_b32_e32 v2, s74 -; GFX8-NEXT: v_mov_b32_e32 v3, s75 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x90 -; GFX8-NEXT: v_mov_b32_e32 v0, s72 -; GFX8-NEXT: v_mov_b32_e32 v1, s73 ; GFX8-NEXT: v_mov_b32_e32 v2, s70 ; GFX8-NEXT: v_mov_b32_e32 v3, s71 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x80 -; GFX8-NEXT: v_mov_b32_e32 v0, s68 -; GFX8-NEXT: v_mov_b32_e32 v1, s69 -; GFX8-NEXT: v_mov_b32_e32 v2, s66 -; GFX8-NEXT: v_mov_b32_e32 v3, s67 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v0, s62 +; GFX8-NEXT: v_mov_b32_e32 v1, s63 +; GFX8-NEXT: v_mov_b32_e32 v2, s60 +; GFX8-NEXT: v_mov_b32_e32 v3, s61 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; GFX8-NEXT: v_mov_b32_e32 v0, s64 -; GFX8-NEXT: v_mov_b32_e32 v1, s65 -; GFX8-NEXT: v_mov_b32_e32 v2, s62 -; GFX8-NEXT: v_mov_b32_e32 v3, s63 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xc0 +; GFX8-NEXT: v_mov_b32_e32 v0, s58 +; GFX8-NEXT: v_mov_b32_e32 v1, s59 +; GFX8-NEXT: v_mov_b32_e32 v2, s56 +; GFX8-NEXT: v_mov_b32_e32 v3, s57 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x60 -; GFX8-NEXT: v_mov_b32_e32 v0, s60 -; GFX8-NEXT: v_mov_b32_e32 v1, s61 -; GFX8-NEXT: v_mov_b32_e32 v2, s58 -; GFX8-NEXT: v_mov_b32_e32 v3, s59 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xb0 +; GFX8-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NEXT: v_mov_b32_e32 v2, s52 +; GFX8-NEXT: v_mov_b32_e32 v3, s53 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 0x50 -; GFX8-NEXT: v_mov_b32_e32 v0, s56 -; GFX8-NEXT: v_mov_b32_e32 v1, s57 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0xa0 +; GFX8-NEXT: v_mov_b32_e32 v0, s50 +; GFX8-NEXT: v_mov_b32_e32 v1, s51 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x90 +; GFX8-NEXT: v_mov_b32_e32 v0, s46 +; GFX8-NEXT: v_mov_b32_e32 v1, s47 +; GFX8-NEXT: v_mov_b32_e32 v2, s44 +; GFX8-NEXT: v_mov_b32_e32 v3, s45 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x80 +; GFX8-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NEXT: v_mov_b32_e32 v1, s25 +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x70 +; GFX8-NEXT: v_mov_b32_e32 v0, s26 +; GFX8-NEXT: v_mov_b32_e32 v1, s27 +; GFX8-NEXT: v_mov_b32_e32 v2, s30 +; GFX8-NEXT: v_mov_b32_e32 v3, s31 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s28 +; GFX8-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s55 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_readlane_b32 s0, v50, 62 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 64 -; GFX8-NEXT: v_mov_b32_e32 v0, s52 -; GFX8-NEXT: v_mov_b32_e32 v1, s53 -; GFX8-NEXT: v_mov_b32_e32 v2, s50 -; GFX8-NEXT: v_mov_b32_e32 v3, s51 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_readlane_b32 s1, v50, 63 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x60 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v2, s48 +; GFX8-NEXT: v_mov_b32_e32 v3, s49 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NEXT: v_mov_b32_e32 v0, s48 -; GFX8-NEXT: v_mov_b32_e32 v1, s49 -; GFX8-NEXT: v_mov_b32_e32 v2, s46 -; GFX8-NEXT: v_mov_b32_e32 v3, s47 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s42 +; GFX8-NEXT: v_mov_b32_e32 v1, s43 +; GFX8-NEXT: v_mov_b32_e32 v2, s40 +; GFX8-NEXT: v_mov_b32_e32 v3, s41 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_readlane_b32 s0, v50, 60 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_mov_b32_e32 v0, s44 -; GFX8-NEXT: v_mov_b32_e32 v1, s45 -; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_readlane_b32 s1, v50, 61 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_readlane_b32 s0, v50, 58 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s1, v50, 59 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 64 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_readlane_b32 s0, v50, 56 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s40 -; GFX8-NEXT: v_mov_b32_e32 v1, s41 -; GFX8-NEXT: v_mov_b32_e32 v2, s38 -; GFX8-NEXT: v_mov_b32_e32 v3, s39 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_readlane_b32 s1, v50, 57 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_readlane_b32 s0, v50, 54 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s1, v50, 55 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 48 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_readlane_b32 s0, v50, 52 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s36 -; GFX8-NEXT: v_mov_b32_e32 v1, s37 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_readlane_b32 s1, v50, 53 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_readlane_b32 s0, v50, 50 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s1, v50, 51 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 32 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_readlane_b32 s0, v50, 48 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_readlane_b32 s2, v62, 4 -; GFX8-NEXT: v_readlane_b32 s3, v62, 5 +; GFX8-NEXT: v_readlane_b32 s1, v50, 49 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_readlane_b32 s0, v50, 46 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s1, v50, 47 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s8, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s30 -; GFX8-NEXT: v_mov_b32_e32 v1, s31 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] -; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27] +; GFX8-NEXT: v_readlane_b32 s0, v50, 42 +; GFX8-NEXT: v_readlane_b32 s1, v50, 43 +; GFX8-NEXT: v_mov_b32_e32 v49, s39 +; GFX8-NEXT: v_mov_b32_e32 v24, s34 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readlane_b32 s0, v50, 44 +; GFX8-NEXT: v_readlane_b32 s1, v50, 45 +; GFX8-NEXT: v_mov_b32_e32 v48, s38 +; GFX8-NEXT: v_mov_b32_e32 v25, s35 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[16:19] +; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[32:35] ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -10961,512 +11075,724 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX12-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX12-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane +; GFX12-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b64 s[10:11], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[26:27], s[6:7], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s96, s11, 30 -; GFX12-NEXT: s_lshr_b32 s98, s11, 31 -; GFX12-NEXT: s_lshr_b32 s92, s11, 28 -; GFX12-NEXT: s_lshr_b32 s94, s11, 29 -; GFX12-NEXT: s_lshr_b32 s78, s11, 26 -; GFX12-NEXT: s_lshr_b32 s88, s11, 27 -; GFX12-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000 -; GFX12-NEXT: s_lshr_b32 s66, s11, 24 -; GFX12-NEXT: s_lshr_b32 s74, s11, 25 -; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s96 -; GFX12-NEXT: s_lshr_b32 s56, s11, 22 -; GFX12-NEXT: s_lshr_b32 s62, s11, 23 -; GFX12-NEXT: v_dual_mov_b32 v2, s97 :: v_dual_mov_b32 v3, s100 -; GFX12-NEXT: v_dual_mov_b32 v4, s101 :: v_dual_mov_b32 v5, s92 -; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 +; GFX12-NEXT: s_lshr_b32 s0, s27, 22 +; GFX12-NEXT: s_lshr_b32 s80, s27, 30 +; GFX12-NEXT: v_writelane_b32 v34, s0, 0 +; GFX12-NEXT: s_lshr_b32 s0, s27, 23 +; GFX12-NEXT: s_lshr_b32 s82, s27, 31 +; GFX12-NEXT: s_lshr_b32 s90, s27, 28 +; GFX12-NEXT: s_lshr_b32 s92, s27, 29 +; GFX12-NEXT: v_writelane_b32 v34, s1, 1 +; GFX12-NEXT: s_lshr_b32 vcc_lo, s27, 26 +; GFX12-NEXT: s_lshr_b32 s98, s27, 27 +; GFX12-NEXT: s_lshr_b32 s96, s27, 24 +; GFX12-NEXT: s_lshr_b32 s94, s27, 25 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 2 +; GFX12-NEXT: s_lshr_b32 s0, s27, 20 +; GFX12-NEXT: s_lshr_b32 s100, s27, 11 +; GFX12-NEXT: s_lshr_b32 s102, s27, 8 +; GFX12-NEXT: s_lshr_b32 s84, s27, 9 +; GFX12-NEXT: v_writelane_b32 v34, s1, 3 +; GFX12-NEXT: s_lshr_b32 s76, s27, 6 +; GFX12-NEXT: s_lshr_b32 s54, s27, 7 +; GFX12-NEXT: s_lshr_b32 s28, s27, 4 +; GFX12-NEXT: s_lshr_b32 s24, s27, 5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 4 +; GFX12-NEXT: s_lshr_b32 s0, s27, 21 +; GFX12-NEXT: s_lshr_b32 s22, s27, 2 +; GFX12-NEXT: s_lshr_b32 s20, s27, 3 +; GFX12-NEXT: s_lshr_b32 s18, s27, 1 +; GFX12-NEXT: v_writelane_b32 v34, s1, 5 +; GFX12-NEXT: s_mov_b32 s16, s27 +; GFX12-NEXT: s_lshr_b32 s88, s26, 30 +; GFX12-NEXT: s_lshr_b32 s86, s26, 31 +; GFX12-NEXT: s_lshr_b32 s78, s26, 28 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 6 +; GFX12-NEXT: s_lshr_b32 s0, s27, 18 +; GFX12-NEXT: s_lshr_b32 s74, s26, 29 +; GFX12-NEXT: s_lshr_b32 s66, s26, 26 +; GFX12-NEXT: s_lshr_b32 s60, s26, 27 +; GFX12-NEXT: v_writelane_b32 v34, s1, 7 +; GFX12-NEXT: s_lshr_b32 s56, s26, 24 +; GFX12-NEXT: s_lshr_b32 s52, s26, 25 +; GFX12-NEXT: s_lshr_b32 s50, s26, 22 +; GFX12-NEXT: s_lshr_b32 s48, s26, 23 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 8 +; GFX12-NEXT: s_lshr_b32 s0, s27, 19 +; GFX12-NEXT: s_lshr_b32 s46, s26, 20 +; GFX12-NEXT: s_lshr_b32 s44, s26, 21 +; GFX12-NEXT: s_lshr_b32 s42, s26, 18 +; GFX12-NEXT: v_writelane_b32 v34, s1, 9 +; GFX12-NEXT: s_lshr_b32 s40, s26, 19 +; GFX12-NEXT: s_lshr_b32 s38, s26, 16 +; GFX12-NEXT: s_lshr_b32 s36, s26, 17 +; GFX12-NEXT: s_lshr_b32 s34, s26, 14 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 10 +; GFX12-NEXT: s_lshr_b32 s0, s27, 16 +; GFX12-NEXT: s_lshr_b32 s30, s26, 15 +; GFX12-NEXT: s_lshr_b32 s14, s26, 12 +; GFX12-NEXT: s_lshr_b32 s58, s26, 13 +; GFX12-NEXT: v_writelane_b32 v34, s1, 11 +; GFX12-NEXT: s_lshr_b32 s62, s26, 10 +; GFX12-NEXT: s_lshr_b32 s64, s26, 11 +; GFX12-NEXT: s_lshr_b32 s68, s26, 8 +; GFX12-NEXT: s_lshr_b32 s70, s26, 9 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 12 +; GFX12-NEXT: s_lshr_b32 s0, s27, 17 +; GFX12-NEXT: s_lshr_b32 s72, s26, 6 +; GFX12-NEXT: s_lshr_b32 s12, s26, 7 +; GFX12-NEXT: s_lshr_b32 s10, s26, 4 +; GFX12-NEXT: v_writelane_b32 v34, s1, 13 +; GFX12-NEXT: s_lshr_b32 s8, s26, 5 +; GFX12-NEXT: s_lshr_b32 s6, s26, 2 +; GFX12-NEXT: s_lshr_b32 s2, s26, 3 +; GFX12-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 14 +; GFX12-NEXT: s_lshr_b32 s0, s27, 14 ; GFX12-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000 -; GFX12-NEXT: s_lshr_b32 s44, s11, 20 -; GFX12-NEXT: s_lshr_b32 s52, s11, 21 -; GFX12-NEXT: s_lshr_b32 s30, s11, 18 -; GFX12-NEXT: s_lshr_b32 s40, s11, 19 -; GFX12-NEXT: s_lshr_b32 s18, s11, 16 -; GFX12-NEXT: s_lshr_b32 s26, s11, 17 -; GFX12-NEXT: s_lshr_b32 s2, s11, 14 -; GFX12-NEXT: s_lshr_b32 s4, s11, 15 -; GFX12-NEXT: v_dual_mov_b32 v6, s93 :: v_dual_mov_b32 v7, s94 -; GFX12-NEXT: v_dual_mov_b32 v8, s95 :: v_dual_mov_b32 v9, s78 -; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX12-NEXT: s_lshr_b32 s6, s11, 12 -; GFX12-NEXT: s_lshr_b32 s8, s11, 13 -; GFX12-NEXT: v_dual_mov_b32 v10, s79 :: v_dual_mov_b32 v11, s88 -; GFX12-NEXT: v_dual_mov_b32 v12, s89 :: v_dual_mov_b32 v13, s66 +; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 +; GFX12-NEXT: v_writelane_b32 v34, s1, 15 +; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 16 +; GFX12-NEXT: s_lshr_b32 s0, s27, 15 +; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 -; GFX12-NEXT: s_lshr_b32 s12, s11, 10 -; GFX12-NEXT: s_lshr_b32 s14, s11, 11 -; GFX12-NEXT: v_dual_mov_b32 v14, s67 :: v_dual_mov_b32 v15, s74 -; GFX12-NEXT: v_dual_mov_b32 v16, s75 :: v_dual_mov_b32 v17, s56 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX12-NEXT: v_writelane_b32 v34, s1, 17 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX12-NEXT: s_lshr_b32 s16, s11, 8 -; GFX12-NEXT: s_lshr_b32 s20, s11, 9 -; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v19, s62 -; GFX12-NEXT: v_dual_mov_b32 v20, s63 :: v_dual_mov_b32 v21, s44 -; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX12-NEXT: s_lshr_b32 s22, s11, 6 -; GFX12-NEXT: s_lshr_b32 s24, s11, 7 -; GFX12-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s52 -; GFX12-NEXT: v_dual_mov_b32 v24, s53 :: v_dual_mov_b32 v25, s30 -; GFX12-NEXT: v_dual_mov_b32 v26, s31 :: v_dual_mov_b32 v27, s40 -; GFX12-NEXT: v_dual_mov_b32 v28, s41 :: v_dual_mov_b32 v29, s18 -; GFX12-NEXT: v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s26 -; GFX12-NEXT: v_mov_b32_e32 v32, s27 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX12-NEXT: s_clause 0x7 -; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:496 -; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:480 -; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:464 -; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:448 -; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:432 -; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:416 -; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:400 -; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:384 -; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 -; GFX12-NEXT: v_mov_b32_e32 v5, s6 -; GFX12-NEXT: s_lshr_b32 s28, s11, 4 -; GFX12-NEXT: s_lshr_b32 s34, s11, 5 -; GFX12-NEXT: s_lshr_b32 s36, s11, 2 -; GFX12-NEXT: s_lshr_b32 s38, s11, 3 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 -; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s12 -; GFX12-NEXT: s_lshr_b32 s42, s11, 1 -; GFX12-NEXT: s_mov_b32 s46, s11 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 -; GFX12-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 -; GFX12-NEXT: s_lshr_b32 s48, s10, 30 -; GFX12-NEXT: s_lshr_b32 s50, s10, 31 -; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 18 +; GFX12-NEXT: s_lshr_b32 s0, s27, 12 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s20 -; GFX12-NEXT: v_dual_mov_b32 v16, s21 :: v_dual_mov_b32 v17, s22 -; GFX12-NEXT: s_lshr_b32 s54, s10, 28 -; GFX12-NEXT: s_lshr_b32 s58, s10, 29 -; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v18, s23 :: v_dual_mov_b32 v19, s24 -; GFX12-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v21, s28 -; GFX12-NEXT: s_lshr_b32 s60, s10, 26 -; GFX12-NEXT: s_lshr_b32 s64, s10, 27 -; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s34 -; GFX12-NEXT: v_mov_b32_e32 v24, s35 -; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:368 -; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:352 -; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:336 -; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:320 -; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:304 -; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:288 -; GFX12-NEXT: v_dual_mov_b32 v1, s36 :: v_dual_mov_b32 v2, s37 -; GFX12-NEXT: v_dual_mov_b32 v3, s38 :: v_dual_mov_b32 v4, s39 -; GFX12-NEXT: v_mov_b32_e32 v5, s46 -; GFX12-NEXT: s_lshr_b32 s68, s10, 24 -; GFX12-NEXT: s_lshr_b32 s70, s10, 25 -; GFX12-NEXT: s_lshr_b32 s72, s10, 22 -; GFX12-NEXT: s_lshr_b32 s76, s10, 23 +; GFX12-NEXT: v_writelane_b32 v34, s1, 19 +; GFX12-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v6, s47 :: v_dual_mov_b32 v7, s42 -; GFX12-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v9, s48 -; GFX12-NEXT: s_lshr_b32 s80, s10, 20 -; GFX12-NEXT: s_lshr_b32 s82, s10, 21 +; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v10, s49 :: v_dual_mov_b32 v11, s50 -; GFX12-NEXT: v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v13, s54 -; GFX12-NEXT: s_lshr_b32 s84, s10, 18 -; GFX12-NEXT: s_lshr_b32 s86, s10, 19 -; GFX12-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 20 +; GFX12-NEXT: s_lshr_b32 s0, s27, 13 ; GFX12-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v14, s55 :: v_dual_mov_b32 v15, s58 -; GFX12-NEXT: v_dual_mov_b32 v16, s59 :: v_dual_mov_b32 v17, s60 -; GFX12-NEXT: s_lshr_b32 s90, s10, 16 -; GFX12-NEXT: s_lshr_b32 s98, s10, 17 +; GFX12-NEXT: v_writelane_b32 v34, s1, 21 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 22 +; GFX12-NEXT: s_lshr_b32 s0, s27, 10 +; GFX12-NEXT: v_writelane_b32 v34, s1, 23 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 24 +; GFX12-NEXT: s_lshr_b32 s0, s26, 1 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX12-NEXT: v_writelane_b32 v34, s1, 25 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX12-NEXT: v_writelane_b32 v34, s26, 26 +; GFX12-NEXT: v_writelane_b32 v34, s27, 27 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[14:15], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[20:21], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 28 +; GFX12-NEXT: v_writelane_b32 v34, s1, 29 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[80:81], 0x10000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v34, s0, 30 +; GFX12-NEXT: v_writelane_b32 v34, s1, 31 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[6:7], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[18:19], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v33, s0, 0 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[22:23], 0x10000 +; GFX12-NEXT: v_readlane_b32 s80, v34, 12 +; GFX12-NEXT: v_readlane_b32 s81, v34, 13 +; GFX12-NEXT: v_readlane_b32 s22, v34, 2 +; GFX12-NEXT: v_writelane_b32 v33, s1, 1 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[8:9], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[82:83], 0x10000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s8 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v33, s0, 2 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[90:91], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s2 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[92:93], 0x10000 +; GFX12-NEXT: v_writelane_b32 v33, s1, 3 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[10:11], vcc, 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10 +; GFX12-NEXT: v_writelane_b32 v33, s0, 4 +; GFX12-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s8 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[28:29], 0x10000 +; GFX12-NEXT: v_writelane_b32 v33, s1, 5 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[98:99], 0x10000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_writelane_b32 v33, s0, 6 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[54:55], 0x10000 +; GFX12-NEXT: v_readlane_b32 s54, v34, 8 +; GFX12-NEXT: v_readlane_b32 s55, v34, 9 +; GFX12-NEXT: v_readlane_b32 s24, v34, 4 +; GFX12-NEXT: v_writelane_b32 v33, s1, 7 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[16:17], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[96:97], 0x10000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v12, s13 :: v_dual_mov_b32 v13, s16 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[94:95], 0x10000 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s12 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[76:77], 0x10000 +; GFX12-NEXT: v_readlane_b32 s76, v34, 10 +; GFX12-NEXT: v_readlane_b32 s77, v34, 11 +; GFX12-NEXT: v_readlane_b32 s25, v34, 5 +; GFX12-NEXT: v_readlane_b32 s28, v34, 6 +; GFX12-NEXT: v_readlane_b32 s29, v34, 7 +; GFX12-NEXT: v_readlane_b32 s23, v34, 3 +; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 +; GFX12-NEXT: v_readlane_b32 s20, v34, 0 +; GFX12-NEXT: v_readlane_b32 s21, v34, 1 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v26, s55 :: v_dual_mov_b32 v27, s76 +; GFX12-NEXT: v_dual_mov_b32 v28, s77 :: v_dual_mov_b32 v29, s80 +; GFX12-NEXT: v_readlane_b32 s76, v34, 18 +; GFX12-NEXT: v_readlane_b32 s77, v34, 19 +; GFX12-NEXT: v_readlane_b32 s82, v34, 16 +; GFX12-NEXT: v_readlane_b32 s83, v34, 17 +; GFX12-NEXT: v_mov_b32_e32 v30, s81 +; GFX12-NEXT: v_readlane_b32 s80, v34, 14 +; GFX12-NEXT: v_readlane_b32 s81, v34, 15 +; GFX12-NEXT: v_dual_mov_b32 v22, s25 :: v_dual_mov_b32 v23, s28 +; GFX12-NEXT: v_dual_mov_b32 v24, s29 :: v_dual_mov_b32 v25, s54 +; GFX12-NEXT: v_readlane_b32 s28, v34, 22 +; GFX12-NEXT: v_readlane_b32 s29, v34, 23 +; GFX12-NEXT: v_readlane_b32 s54, v34, 20 +; GFX12-NEXT: v_readlane_b32 s55, v34, 21 +; GFX12-NEXT: v_dual_mov_b32 v20, s23 :: v_dual_mov_b32 v21, s24 +; GFX12-NEXT: v_readlane_b32 s24, v34, 24 +; GFX12-NEXT: v_readlane_b32 s25, v34, 25 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v17, s20 +; GFX12-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v18, s61 :: v_dual_mov_b32 v19, s64 -; GFX12-NEXT: v_dual_mov_b32 v20, s65 :: v_dual_mov_b32 v21, s68 -; GFX12-NEXT: s_lshr_b32 s96, s10, 14 -; GFX12-NEXT: s_lshr_b32 s100, s10, 15 -; GFX12-NEXT: s_lshr_b32 s94, s10, 13 -; GFX12-NEXT: s_lshr_b32 s88, s10, 11 -; GFX12-NEXT: s_lshr_b32 s74, s10, 9 -; GFX12-NEXT: s_lshr_b32 s62, s10, 7 -; GFX12-NEXT: s_lshr_b32 s52, s10, 5 -; GFX12-NEXT: s_lshr_b32 s40, s10, 3 -; GFX12-NEXT: s_lshr_b32 s26, s10, 1 -; GFX12-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v22, s69 :: v_dual_mov_b32 v23, s70 -; GFX12-NEXT: v_mov_b32_e32 v24, s71 -; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:272 -; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:256 -; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:240 -; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:224 -; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:208 -; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:192 -; GFX12-NEXT: v_dual_mov_b32 v1, s72 :: v_dual_mov_b32 v2, s73 -; GFX12-NEXT: v_dual_mov_b32 v3, s76 :: v_dual_mov_b32 v4, s77 -; GFX12-NEXT: v_mov_b32_e32 v5, s80 -; GFX12-NEXT: s_lshr_b32 s92, s10, 12 -; GFX12-NEXT: s_lshr_b32 s78, s10, 10 -; GFX12-NEXT: s_bfe_i64 s[98:99], s[98:99], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v6, s81 :: v_dual_mov_b32 v7, s82 -; GFX12-NEXT: v_dual_mov_b32 v8, s83 :: v_dual_mov_b32 v9, s84 -; GFX12-NEXT: s_lshr_b32 s66, s10, 8 -; GFX12-NEXT: s_lshr_b32 s56, s10, 6 -; GFX12-NEXT: s_lshr_b32 s44, s10, 4 -; GFX12-NEXT: s_lshr_b32 s30, s10, 2 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[62:63], s[74:75], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[74:75], s[88:89], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[88:89], s[94:95], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[94:95], s[100:101], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v10, s85 :: v_dual_mov_b32 v11, s86 -; GFX12-NEXT: v_dual_mov_b32 v12, s87 :: v_dual_mov_b32 v13, s90 -; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v14, s91 :: v_dual_mov_b32 v15, s98 +; GFX12-NEXT: v_dual_mov_b32 v18, s21 :: v_dual_mov_b32 v19, s22 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[100:101], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[4:5] offset:496 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[4:5] offset:480 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v16, s99 :: v_dual_mov_b32 v17, s96 -; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v18, s97 :: v_dual_mov_b32 v19, s94 -; GFX12-NEXT: v_dual_mov_b32 v20, s95 :: v_dual_mov_b32 v21, s92 -; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v22, s93 :: v_dual_mov_b32 v23, s88 -; GFX12-NEXT: v_mov_b32_e32 v24, s89 +; GFX12-NEXT: v_dual_mov_b32 v32, s81 :: v_dual_mov_b32 v1, s82 +; GFX12-NEXT: v_dual_mov_b32 v2, s83 :: v_dual_mov_b32 v3, s76 +; GFX12-NEXT: v_dual_mov_b32 v4, s77 :: v_dual_mov_b32 v5, s54 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[84:85], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[102:103], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v6, s55 :: v_dual_mov_b32 v7, s28 +; GFX12-NEXT: v_dual_mov_b32 v8, s29 :: v_dual_mov_b32 v31, s80 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:176 -; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:160 -; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:144 -; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:128 -; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v1, s78 :: v_dual_mov_b32 v2, s79 -; GFX12-NEXT: v_dual_mov_b32 v3, s74 :: v_dual_mov_b32 v4, s75 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v5, s66 -; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v6, s67 :: v_dual_mov_b32 v7, s62 -; GFX12-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s56 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v10, s57 :: v_dual_mov_b32 v11, s52 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[4:5] offset:464 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[4:5] offset:448 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[4:5] offset:432 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[4:5] offset:416 +; GFX12-NEXT: global_store_b128 v0, v[25:28], s[4:5] offset:400 +; GFX12-NEXT: global_store_b128 v0, v[29:32], s[4:5] offset:384 +; GFX12-NEXT: v_dual_mov_b32 v9, s24 :: v_dual_mov_b32 v10, s25 +; GFX12-NEXT: v_dual_mov_b32 v11, s22 :: v_dual_mov_b32 v12, s23 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v12, s53 :: v_dual_mov_b32 v13, s44 -; GFX12-NEXT: v_dual_mov_b32 v14, s45 :: v_dual_mov_b32 v15, s40 -; GFX12-NEXT: v_dual_mov_b32 v16, s41 :: v_dual_mov_b32 v17, s30 -; GFX12-NEXT: v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v19, s26 -; GFX12-NEXT: v_dual_mov_b32 v20, s27 :: v_dual_mov_b32 v21, s18 -; GFX12-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s10 -; GFX12-NEXT: v_mov_b32_e32 v24, s11 +; GFX12-NEXT: v_dual_mov_b32 v13, s20 :: v_dual_mov_b32 v14, s21 +; GFX12-NEXT: v_dual_mov_b32 v15, s12 :: v_dual_mov_b32 v16, s13 +; GFX12-NEXT: v_dual_mov_b32 v17, s16 :: v_dual_mov_b32 v18, s17 +; GFX12-NEXT: v_dual_mov_b32 v19, s10 :: v_dual_mov_b32 v20, s11 +; GFX12-NEXT: v_dual_mov_b32 v21, s8 :: v_dual_mov_b32 v22, s9 +; GFX12-NEXT: v_dual_mov_b32 v23, s2 :: v_dual_mov_b32 v24, s3 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[4:5] offset:368 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[4:5] offset:352 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[4:5] offset:336 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[4:5] offset:320 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[4:5] offset:304 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[4:5] offset:288 +; GFX12-NEXT: v_dual_mov_b32 v1, s18 :: v_dual_mov_b32 v2, s19 +; GFX12-NEXT: v_dual_mov_b32 v3, s14 :: v_dual_mov_b32 v4, s15 +; GFX12-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1 +; GFX12-NEXT: v_dual_mov_b32 v7, s6 :: v_dual_mov_b32 v8, s7 +; GFX12-NEXT: v_dual_mov_b32 v9, s88 :: v_dual_mov_b32 v10, s89 +; GFX12-NEXT: v_dual_mov_b32 v11, s86 :: v_dual_mov_b32 v12, s87 +; GFX12-NEXT: v_dual_mov_b32 v13, s78 :: v_dual_mov_b32 v14, s79 +; GFX12-NEXT: v_dual_mov_b32 v15, s74 :: v_dual_mov_b32 v16, s75 +; GFX12-NEXT: v_dual_mov_b32 v17, s66 :: v_dual_mov_b32 v18, s67 +; GFX12-NEXT: v_dual_mov_b32 v19, s60 :: v_dual_mov_b32 v20, s61 +; GFX12-NEXT: v_dual_mov_b32 v21, s56 :: v_dual_mov_b32 v22, s57 +; GFX12-NEXT: v_dual_mov_b32 v23, s52 :: v_dual_mov_b32 v24, s53 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[4:5] offset:272 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[4:5] offset:256 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[4:5] offset:240 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[4:5] offset:224 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[4:5] offset:208 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[4:5] offset:192 +; GFX12-NEXT: v_dual_mov_b32 v1, s50 :: v_dual_mov_b32 v2, s51 +; GFX12-NEXT: v_dual_mov_b32 v3, s48 :: v_dual_mov_b32 v4, s49 +; GFX12-NEXT: v_mov_b32_e32 v5, s46 +; GFX12-NEXT: v_readlane_b32 s0, v33, 6 +; GFX12-NEXT: v_readlane_b32 s1, v33, 7 +; GFX12-NEXT: v_dual_mov_b32 v6, s47 :: v_dual_mov_b32 v7, s44 +; GFX12-NEXT: v_dual_mov_b32 v8, s45 :: v_dual_mov_b32 v9, s42 +; GFX12-NEXT: v_dual_mov_b32 v10, s43 :: v_dual_mov_b32 v11, s40 +; GFX12-NEXT: v_dual_mov_b32 v12, s41 :: v_dual_mov_b32 v13, s38 +; GFX12-NEXT: v_dual_mov_b32 v14, s39 :: v_dual_mov_b32 v15, s36 +; GFX12-NEXT: v_dual_mov_b32 v16, s37 :: v_dual_mov_b32 v17, s34 +; GFX12-NEXT: v_dual_mov_b32 v18, s35 :: v_dual_mov_b32 v19, s30 +; GFX12-NEXT: v_dual_mov_b32 v20, s31 :: v_dual_mov_b32 v21, s26 +; GFX12-NEXT: v_dual_mov_b32 v22, s27 :: v_dual_mov_b32 v23, s58 +; GFX12-NEXT: v_mov_b32_e32 v24, s59 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[4:5] offset:176 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[4:5] offset:160 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[4:5] offset:144 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[4:5] offset:128 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[4:5] offset:112 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[4:5] offset:96 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_dual_mov_b32 v10, s73 :: v_dual_mov_b32 v11, s0 +; GFX12-NEXT: v_readlane_b32 s0, v33, 4 +; GFX12-NEXT: v_mov_b32_e32 v12, s1 +; GFX12-NEXT: v_readlane_b32 s1, v33, 5 +; GFX12-NEXT: v_dual_mov_b32 v1, s62 :: v_dual_mov_b32 v2, s63 +; GFX12-NEXT: v_mov_b32_e32 v3, s64 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_mov_b32_e32 v13, s0 +; GFX12-NEXT: v_readlane_b32 s0, v33, 2 +; GFX12-NEXT: v_mov_b32_e32 v14, s1 +; GFX12-NEXT: v_readlane_b32 s1, v33, 3 +; GFX12-NEXT: v_dual_mov_b32 v4, s65 :: v_dual_mov_b32 v5, s68 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_mov_b32_e32 v15, s0 +; GFX12-NEXT: v_readlane_b32 s0, v33, 0 +; GFX12-NEXT: v_mov_b32_e32 v16, s1 +; GFX12-NEXT: v_readlane_b32 s1, v33, 1 +; GFX12-NEXT: v_dual_mov_b32 v6, s69 :: v_dual_mov_b32 v7, s70 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_mov_b32_e32 v17, s0 +; GFX12-NEXT: v_readlane_b32 s0, v34, 30 +; GFX12-NEXT: v_mov_b32_e32 v18, s1 +; GFX12-NEXT: v_readlane_b32 s1, v34, 31 +; GFX12-NEXT: v_dual_mov_b32 v8, s71 :: v_dual_mov_b32 v9, s72 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_mov_b32_e32 v19, s0 +; GFX12-NEXT: v_readlane_b32 s0, v34, 26 +; GFX12-NEXT: v_mov_b32_e32 v20, s1 +; GFX12-NEXT: v_readlane_b32 s1, v34, 27 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v21, s0 +; GFX12-NEXT: v_readlane_b32 s0, v34, 28 +; GFX12-NEXT: v_mov_b32_e32 v22, s1 +; GFX12-NEXT: v_readlane_b32 s1, v34, 29 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v23, s0 :: v_dual_mov_b32 v24, s1 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v0, v[1:4], s[4:5] offset:80 +; GFX12-NEXT: global_store_b128 v0, v[5:8], s[4:5] offset:64 +; GFX12-NEXT: global_store_b128 v0, v[9:12], s[4:5] offset:48 +; GFX12-NEXT: global_store_b128 v0, v[13:16], s[4:5] offset:32 +; GFX12-NEXT: global_store_b128 v0, v[17:20], s[4:5] offset:16 +; GFX12-NEXT: global_store_b128 v0, v[21:24], s[4:5] ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1250-NEXT: ; implicit-def: $vgpr34 : SGPR spill to VGPR lane +; GFX1250-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[10:11], s[2:3], 0x0 +; GFX1250-NEXT: s_load_b64 s[26:27], s[10:11], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_lshr_b32 s96, s11, 30 -; GFX1250-NEXT: s_lshr_b32 s98, s11, 31 -; GFX1250-NEXT: s_lshr_b32 s92, s11, 28 -; GFX1250-NEXT: s_lshr_b32 s94, s11, 29 -; GFX1250-NEXT: s_lshr_b32 s78, s11, 26 -; GFX1250-NEXT: s_lshr_b32 s88, s11, 27 -; GFX1250-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000 -; GFX1250-NEXT: s_lshr_b32 s66, s11, 24 -; GFX1250-NEXT: s_lshr_b32 s74, s11, 25 -; GFX1250-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s96 -; GFX1250-NEXT: s_lshr_b32 s56, s11, 22 -; GFX1250-NEXT: s_lshr_b32 s62, s11, 23 -; GFX1250-NEXT: v_dual_mov_b32 v1, s97 :: v_dual_mov_b32 v2, s100 -; GFX1250-NEXT: v_dual_mov_b32 v3, s101 :: v_dual_mov_b32 v4, s92 -; GFX1250-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 22 +; GFX1250-NEXT: s_lshr_b32 s80, s27, 30 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 0 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 23 +; GFX1250-NEXT: s_lshr_b32 s82, s27, 31 +; GFX1250-NEXT: s_lshr_b32 s90, s27, 28 +; GFX1250-NEXT: s_lshr_b32 s92, s27, 29 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 1 +; GFX1250-NEXT: s_lshr_b32 vcc_lo, s27, 26 +; GFX1250-NEXT: s_lshr_b32 s98, s27, 27 +; GFX1250-NEXT: s_lshr_b32 s96, s27, 24 +; GFX1250-NEXT: s_lshr_b32 s94, s27, 25 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 2 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 20 +; GFX1250-NEXT: s_lshr_b32 s100, s27, 11 +; GFX1250-NEXT: s_lshr_b32 s102, s27, 8 +; GFX1250-NEXT: s_lshr_b32 s84, s27, 9 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 3 +; GFX1250-NEXT: s_lshr_b32 s76, s27, 6 +; GFX1250-NEXT: s_lshr_b32 s54, s27, 7 +; GFX1250-NEXT: s_lshr_b32 s28, s27, 4 +; GFX1250-NEXT: s_lshr_b32 s24, s27, 5 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 4 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 21 +; GFX1250-NEXT: s_lshr_b32 s22, s27, 2 +; GFX1250-NEXT: s_lshr_b32 s20, s27, 3 +; GFX1250-NEXT: s_lshr_b32 s18, s27, 1 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 5 +; GFX1250-NEXT: s_mov_b32 s16, s27 +; GFX1250-NEXT: s_lshr_b32 s88, s26, 30 +; GFX1250-NEXT: s_lshr_b32 s86, s26, 31 +; GFX1250-NEXT: s_lshr_b32 s78, s26, 28 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 6 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 18 +; GFX1250-NEXT: s_lshr_b32 s74, s26, 29 +; GFX1250-NEXT: s_lshr_b32 s66, s26, 26 +; GFX1250-NEXT: s_lshr_b32 s60, s26, 27 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 7 +; GFX1250-NEXT: s_lshr_b32 s56, s26, 24 +; GFX1250-NEXT: s_lshr_b32 s52, s26, 25 +; GFX1250-NEXT: s_lshr_b32 s50, s26, 22 +; GFX1250-NEXT: s_lshr_b32 s48, s26, 23 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 8 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 19 +; GFX1250-NEXT: s_lshr_b32 s46, s26, 20 +; GFX1250-NEXT: s_lshr_b32 s44, s26, 21 +; GFX1250-NEXT: s_lshr_b32 s42, s26, 18 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 9 +; GFX1250-NEXT: s_lshr_b32 s40, s26, 19 +; GFX1250-NEXT: s_lshr_b32 s38, s26, 16 +; GFX1250-NEXT: s_lshr_b32 s36, s26, 17 +; GFX1250-NEXT: s_lshr_b32 s34, s26, 14 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 10 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 16 +; GFX1250-NEXT: s_lshr_b32 s30, s26, 15 +; GFX1250-NEXT: s_lshr_b32 s14, s26, 12 +; GFX1250-NEXT: s_lshr_b32 s58, s26, 13 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 11 +; GFX1250-NEXT: s_lshr_b32 s62, s26, 10 +; GFX1250-NEXT: s_lshr_b32 s64, s26, 11 +; GFX1250-NEXT: s_lshr_b32 s68, s26, 8 +; GFX1250-NEXT: s_lshr_b32 s70, s26, 9 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 12 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 17 +; GFX1250-NEXT: s_lshr_b32 s72, s26, 6 +; GFX1250-NEXT: s_lshr_b32 s12, s26, 7 +; GFX1250-NEXT: s_lshr_b32 s10, s26, 4 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 13 +; GFX1250-NEXT: s_lshr_b32 s6, s26, 5 +; GFX1250-NEXT: s_lshr_b32 s4, s26, 2 +; GFX1250-NEXT: s_lshr_b32 s2, s26, 3 +; GFX1250-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 14 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 14 ; GFX1250-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000 -; GFX1250-NEXT: s_lshr_b32 s44, s11, 20 -; GFX1250-NEXT: s_lshr_b32 s52, s11, 21 -; GFX1250-NEXT: s_lshr_b32 s30, s11, 18 -; GFX1250-NEXT: s_lshr_b32 s40, s11, 19 -; GFX1250-NEXT: s_lshr_b32 s18, s11, 16 -; GFX1250-NEXT: s_lshr_b32 s26, s11, 17 -; GFX1250-NEXT: s_lshr_b32 s2, s11, 14 -; GFX1250-NEXT: s_lshr_b32 s4, s11, 15 -; GFX1250-NEXT: v_dual_mov_b32 v5, s93 :: v_dual_mov_b32 v6, s94 -; GFX1250-NEXT: v_dual_mov_b32 v7, s95 :: v_dual_mov_b32 v10, s78 -; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX1250-NEXT: s_lshr_b32 s6, s11, 12 -; GFX1250-NEXT: s_lshr_b32 s8, s11, 13 -; GFX1250-NEXT: v_dual_mov_b32 v11, s79 :: v_dual_mov_b32 v12, s88 -; GFX1250-NEXT: v_dual_mov_b32 v13, s89 :: v_dual_mov_b32 v14, s66 +; GFX1250-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 15 +; GFX1250-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 16 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 15 +; GFX1250-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 -; GFX1250-NEXT: s_lshr_b32 s12, s11, 10 -; GFX1250-NEXT: s_lshr_b32 s14, s11, 11 -; GFX1250-NEXT: v_dual_mov_b32 v15, s67 :: v_dual_mov_b32 v16, s74 -; GFX1250-NEXT: v_dual_mov_b32 v17, s75 :: v_dual_mov_b32 v18, s56 ; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 17 +; GFX1250-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 18 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 12 +; GFX1250-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 19 +; GFX1250-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 20 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 13 +; GFX1250-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 21 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 22 +; GFX1250-NEXT: s_lshr_b32 s0, s27, 10 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 23 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 24 +; GFX1250-NEXT: s_lshr_b32 s0, s26, 1 ; GFX1250-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX1250-NEXT: s_lshr_b32 s16, s11, 8 -; GFX1250-NEXT: s_lshr_b32 s20, s11, 9 -; GFX1250-NEXT: v_dual_mov_b32 v19, s57 :: v_dual_mov_b32 v20, s62 -; GFX1250-NEXT: v_dual_mov_b32 v21, s63 :: v_dual_mov_b32 v22, s44 -; GFX1250-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX1250-NEXT: s_lshr_b32 s22, s11, 6 -; GFX1250-NEXT: s_lshr_b32 s24, s11, 7 -; GFX1250-NEXT: v_dual_mov_b32 v23, s45 :: v_dual_mov_b32 v24, s52 -; GFX1250-NEXT: v_dual_mov_b32 v25, s53 :: v_dual_mov_b32 v26, s30 -; GFX1250-NEXT: v_dual_mov_b32 v27, s31 :: v_dual_mov_b32 v28, s40 -; GFX1250-NEXT: v_dual_mov_b32 v29, s41 :: v_dual_mov_b32 v30, s18 -; GFX1250-NEXT: v_dual_mov_b32 v31, s19 :: v_dual_mov_b32 v32, s26 -; GFX1250-NEXT: v_mov_b32_e32 v33, s27 -; GFX1250-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX1250-NEXT: s_clause 0x7 -; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:496 -; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:480 -; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:464 -; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:448 -; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:432 -; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:416 -; GFX1250-NEXT: global_store_b128 v8, v[26:29], s[0:1] offset:400 -; GFX1250-NEXT: global_store_b128 v8, v[30:33], s[0:1] offset:384 -; GFX1250-NEXT: s_wait_xcnt 0x7 -; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX1250-NEXT: s_wait_xcnt 0x6 -; GFX1250-NEXT: v_mov_b32_e32 v4, s6 -; GFX1250-NEXT: s_lshr_b32 s28, s11, 4 -; GFX1250-NEXT: s_lshr_b32 s34, s11, 5 -; GFX1250-NEXT: s_lshr_b32 s36, s11, 2 -; GFX1250-NEXT: s_lshr_b32 s38, s11, 3 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 25 +; GFX1250-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX1250-NEXT: v_writelane_b32 v34, s26, 26 +; GFX1250-NEXT: v_writelane_b32 v34, s27, 27 +; GFX1250-NEXT: s_bfe_i64 s[26:27], s[14:15], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[14:15], s[20:21], 0x10000 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 28 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 29 +; GFX1250-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[2:3], s[82:83], 0x10000 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_writelane_b32 v34, s0, 30 +; GFX1250-NEXT: v_writelane_b32 v34, s1, 31 +; GFX1250-NEXT: s_bfe_i64 s[0:1], s[4:5], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x10000 +; GFX1250-NEXT: v_writelane_b32 v5, s0, 0 +; GFX1250-NEXT: s_bfe_i64 s[16:17], s[94:95], 0x10000 +; GFX1250-NEXT: v_readlane_b32 s20, v34, 0 +; GFX1250-NEXT: v_readlane_b32 s21, v34, 1 +; GFX1250-NEXT: v_readlane_b32 s82, v34, 16 +; GFX1250-NEXT: v_writelane_b32 v5, s1, 1 +; GFX1250-NEXT: s_bfe_i64 s[0:1], s[6:7], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[6:7], s[18:19], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[18:19], s[22:23], 0x10000 +; GFX1250-NEXT: v_readlane_b32 s22, v34, 2 +; GFX1250-NEXT: v_writelane_b32 v5, s0, 2 +; GFX1250-NEXT: v_readlane_b32 s23, v34, 3 +; GFX1250-NEXT: v_readlane_b32 s83, v34, 17 ; GFX1250-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v5, s7 :: v_dual_mov_b32 v6, s8 -; GFX1250-NEXT: s_wait_xcnt 0x5 -; GFX1250-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v10, s12 -; GFX1250-NEXT: s_lshr_b32 s42, s11, 1 -; GFX1250-NEXT: s_mov_b32 s46, s11 -; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX1250-NEXT: v_mov_b32_e32 v16, s16 +; GFX1250-NEXT: v_writelane_b32 v5, s1, 3 +; GFX1250-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[10:11], s[98:99], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v11, s13 :: v_dual_mov_b32 v12, s14 -; GFX1250-NEXT: s_wait_xcnt 0x4 -; GFX1250-NEXT: v_dual_mov_b32 v13, s15 :: v_dual_mov_b32 v14, s16 -; GFX1250-NEXT: s_lshr_b32 s48, s10, 30 -; GFX1250-NEXT: s_lshr_b32 s50, s10, 31 -; GFX1250-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v17, s17 :: v_dual_mov_b32 v18, s20 +; GFX1250-NEXT: v_writelane_b32 v5, s0, 4 +; GFX1250-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v19, s21 :: v_dual_mov_b32 v20, s22 +; GFX1250-NEXT: s_bfe_i64 s[16:17], s[84:85], 0x10000 +; GFX1250-NEXT: v_writelane_b32 v5, s1, 5 +; GFX1250-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[12:13], s[96:97], 0x10000 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v14, s12 +; GFX1250-NEXT: v_writelane_b32 v5, s0, 6 +; GFX1250-NEXT: v_mov_b32_e32 v15, s13 +; GFX1250-NEXT: s_bfe_i64 s[12:13], s[76:77], 0x10000 +; GFX1250-NEXT: v_readlane_b32 s76, v34, 10 +; GFX1250-NEXT: v_readlane_b32 s77, v34, 11 +; GFX1250-NEXT: v_writelane_b32 v5, s1, 7 +; GFX1250-NEXT: s_bfe_i64 s[0:1], s[80:81], 0x10000 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_bfe_i64 s[0:1], s[90:91], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v6, s0 +; GFX1250-NEXT: s_bfe_i64 s[2:3], s[92:93], 0x10000 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v8, s2 +; GFX1250-NEXT: s_bfe_i64 s[0:1], vcc, 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v11, s1 +; GFX1250-NEXT: s_bfe_i64 s[10:11], s[54:55], 0x10000 +; GFX1250-NEXT: v_readlane_b32 s54, v34, 8 +; GFX1250-NEXT: v_readlane_b32 s55, v34, 9 +; GFX1250-NEXT: v_readlane_b32 s80, v34, 12 +; GFX1250-NEXT: v_readlane_b32 s81, v34, 13 +; GFX1250-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v10, s0 +; GFX1250-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[0:1], s[28:29], 0x10000 +; GFX1250-NEXT: v_readlane_b32 s24, v34, 4 +; GFX1250-NEXT: v_readlane_b32 s25, v34, 5 +; GFX1250-NEXT: v_readlane_b32 s28, v34, 6 +; GFX1250-NEXT: v_readlane_b32 s29, v34, 7 +; GFX1250-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v27, s55 :: v_dual_mov_b32 v28, s76 +; GFX1250-NEXT: v_dual_mov_b32 v29, s77 :: v_dual_mov_b32 v30, s80 +; GFX1250-NEXT: v_readlane_b32 s76, v34, 18 +; GFX1250-NEXT: v_readlane_b32 s77, v34, 19 +; GFX1250-NEXT: v_mov_b32_e32 v31, s81 +; GFX1250-NEXT: v_readlane_b32 s80, v34, 14 +; GFX1250-NEXT: v_readlane_b32 s81, v34, 15 +; GFX1250-NEXT: v_dual_mov_b32 v23, s25 :: v_dual_mov_b32 v24, s28 +; GFX1250-NEXT: v_dual_mov_b32 v25, s29 :: v_dual_mov_b32 v26, s54 +; GFX1250-NEXT: v_readlane_b32 s28, v34, 22 +; GFX1250-NEXT: v_readlane_b32 s29, v34, 23 +; GFX1250-NEXT: v_readlane_b32 s54, v34, 20 +; GFX1250-NEXT: v_readlane_b32 s55, v34, 21 +; GFX1250-NEXT: v_dual_mov_b32 v21, s23 :: v_dual_mov_b32 v22, s24 +; GFX1250-NEXT: v_readlane_b32 s24, v34, 24 +; GFX1250-NEXT: v_readlane_b32 s25, v34, 25 +; GFX1250-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v15, s17 :: v_dual_mov_b32 v16, s20 +; GFX1250-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[22:23], s[100:101], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[8:9] offset:496 +; GFX1250-NEXT: global_store_b128 v4, v[6:9], s[8:9] offset:480 +; GFX1250-NEXT: s_wait_xcnt 0x1 +; GFX1250-NEXT: v_dual_mov_b32 v33, s81 :: v_dual_mov_b32 v0, s82 +; GFX1250-NEXT: v_dual_mov_b32 v1, s83 :: v_dual_mov_b32 v2, s76 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v3, s77 :: v_dual_mov_b32 v6, s54 +; GFX1250-NEXT: s_bfe_i64 s[20:21], s[102:103], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v7, s55 :: v_dual_mov_b32 v8, s28 +; GFX1250-NEXT: v_dual_mov_b32 v9, s29 :: v_dual_mov_b32 v32, s80 +; GFX1250-NEXT: s_clause 0x5 +; GFX1250-NEXT: global_store_b128 v4, v[10:13], s[8:9] offset:464 +; GFX1250-NEXT: global_store_b128 v4, v[14:17], s[8:9] offset:448 +; GFX1250-NEXT: global_store_b128 v4, v[18:21], s[8:9] offset:432 +; GFX1250-NEXT: global_store_b128 v4, v[22:25], s[8:9] offset:416 +; GFX1250-NEXT: global_store_b128 v4, v[26:29], s[8:9] offset:400 +; GFX1250-NEXT: global_store_b128 v4, v[30:33], s[8:9] offset:384 +; GFX1250-NEXT: s_wait_xcnt 0x5 +; GFX1250-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25 +; GFX1250-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23 +; GFX1250-NEXT: s_wait_xcnt 0x4 +; GFX1250-NEXT: v_dual_mov_b32 v14, s20 :: v_dual_mov_b32 v15, s21 +; GFX1250-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v17, s17 ; GFX1250-NEXT: s_wait_xcnt 0x3 -; GFX1250-NEXT: v_dual_mov_b32 v17, s21 :: v_dual_mov_b32 v18, s22 -; GFX1250-NEXT: s_lshr_b32 s54, s10, 28 -; GFX1250-NEXT: s_lshr_b32 s58, s10, 29 -; GFX1250-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v19, s23 :: v_dual_mov_b32 v20, s24 +; GFX1250-NEXT: v_dual_mov_b32 v18, s12 :: v_dual_mov_b32 v19, s13 +; GFX1250-NEXT: v_dual_mov_b32 v20, s10 :: v_dual_mov_b32 v21, s11 ; GFX1250-NEXT: s_wait_xcnt 0x2 -; GFX1250-NEXT: v_dual_mov_b32 v21, s25 :: v_dual_mov_b32 v22, s28 -; GFX1250-NEXT: s_lshr_b32 s60, s10, 26 -; GFX1250-NEXT: s_lshr_b32 s64, s10, 27 -; GFX1250-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v23, s29 :: v_dual_mov_b32 v24, s34 -; GFX1250-NEXT: v_mov_b32_e32 v25, s35 +; GFX1250-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1 +; GFX1250-NEXT: v_dual_mov_b32 v24, s2 :: v_dual_mov_b32 v25, s3 ; GFX1250-NEXT: s_clause 0x5 -; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:368 -; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:352 -; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:336 -; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:320 -; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:304 -; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:288 +; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[8:9] offset:368 +; GFX1250-NEXT: global_store_b128 v4, v[6:9], s[8:9] offset:352 +; GFX1250-NEXT: global_store_b128 v4, v[10:13], s[8:9] offset:336 +; GFX1250-NEXT: global_store_b128 v4, v[14:17], s[8:9] offset:320 +; GFX1250-NEXT: global_store_b128 v4, v[18:21], s[8:9] offset:304 +; GFX1250-NEXT: global_store_b128 v4, v[22:25], s[8:9] offset:288 ; GFX1250-NEXT: s_wait_xcnt 0x5 -; GFX1250-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 -; GFX1250-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX1250-NEXT: v_dual_mov_b32 v0, s18 :: v_dual_mov_b32 v1, s19 +; GFX1250-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 ; GFX1250-NEXT: s_wait_xcnt 0x4 -; GFX1250-NEXT: v_mov_b32_e32 v4, s46 -; GFX1250-NEXT: s_lshr_b32 s68, s10, 24 -; GFX1250-NEXT: s_lshr_b32 s70, s10, 25 -; GFX1250-NEXT: s_lshr_b32 s72, s10, 22 -; GFX1250-NEXT: s_lshr_b32 s76, s10, 23 -; GFX1250-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v5, s47 :: v_dual_mov_b32 v6, s42 +; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 ; GFX1250-NEXT: s_wait_xcnt 0x3 -; GFX1250-NEXT: v_dual_mov_b32 v7, s43 :: v_dual_mov_b32 v10, s48 -; GFX1250-NEXT: s_lshr_b32 s80, s10, 20 -; GFX1250-NEXT: s_lshr_b32 s82, s10, 21 -; GFX1250-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v11, s49 :: v_dual_mov_b32 v12, s50 +; GFX1250-NEXT: v_dual_mov_b32 v10, s88 :: v_dual_mov_b32 v11, s89 +; GFX1250-NEXT: v_dual_mov_b32 v12, s86 :: v_dual_mov_b32 v13, s87 ; GFX1250-NEXT: s_wait_xcnt 0x2 -; GFX1250-NEXT: v_dual_mov_b32 v13, s51 :: v_dual_mov_b32 v14, s54 -; GFX1250-NEXT: s_lshr_b32 s84, s10, 18 -; GFX1250-NEXT: s_lshr_b32 s86, s10, 19 -; GFX1250-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v15, s55 :: v_dual_mov_b32 v16, s58 +; GFX1250-NEXT: v_dual_mov_b32 v14, s78 :: v_dual_mov_b32 v15, s79 +; GFX1250-NEXT: v_dual_mov_b32 v16, s74 :: v_dual_mov_b32 v17, s75 ; GFX1250-NEXT: s_wait_xcnt 0x1 -; GFX1250-NEXT: v_dual_mov_b32 v17, s59 :: v_dual_mov_b32 v18, s60 -; GFX1250-NEXT: s_lshr_b32 s90, s10, 16 -; GFX1250-NEXT: s_lshr_b32 s98, s10, 17 -; GFX1250-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v19, s61 :: v_dual_mov_b32 v20, s64 +; GFX1250-NEXT: v_dual_mov_b32 v18, s66 :: v_dual_mov_b32 v19, s67 +; GFX1250-NEXT: v_dual_mov_b32 v20, s60 :: v_dual_mov_b32 v21, s61 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v21, s65 :: v_dual_mov_b32 v22, s68 -; GFX1250-NEXT: s_lshr_b32 s96, s10, 14 -; GFX1250-NEXT: s_lshr_b32 s100, s10, 15 -; GFX1250-NEXT: s_lshr_b32 s94, s10, 13 -; GFX1250-NEXT: s_lshr_b32 s88, s10, 11 -; GFX1250-NEXT: s_lshr_b32 s74, s10, 9 -; GFX1250-NEXT: s_lshr_b32 s62, s10, 7 -; GFX1250-NEXT: s_lshr_b32 s52, s10, 5 -; GFX1250-NEXT: s_lshr_b32 s40, s10, 3 -; GFX1250-NEXT: s_lshr_b32 s26, s10, 1 -; GFX1250-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v23, s69 :: v_dual_mov_b32 v24, s70 -; GFX1250-NEXT: v_mov_b32_e32 v25, s71 +; GFX1250-NEXT: v_dual_mov_b32 v22, s56 :: v_dual_mov_b32 v23, s57 +; GFX1250-NEXT: v_dual_mov_b32 v24, s52 :: v_dual_mov_b32 v25, s53 ; GFX1250-NEXT: s_clause 0x5 -; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:272 -; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:256 -; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:240 -; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:224 -; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:208 -; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:192 +; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[8:9] offset:272 +; GFX1250-NEXT: global_store_b128 v4, v[6:9], s[8:9] offset:256 +; GFX1250-NEXT: global_store_b128 v4, v[10:13], s[8:9] offset:240 +; GFX1250-NEXT: global_store_b128 v4, v[14:17], s[8:9] offset:224 +; GFX1250-NEXT: global_store_b128 v4, v[18:21], s[8:9] offset:208 +; GFX1250-NEXT: global_store_b128 v4, v[22:25], s[8:9] offset:192 ; GFX1250-NEXT: s_wait_xcnt 0x5 -; GFX1250-NEXT: v_dual_mov_b32 v0, s72 :: v_dual_mov_b32 v1, s73 -; GFX1250-NEXT: v_dual_mov_b32 v2, s76 :: v_dual_mov_b32 v3, s77 +; GFX1250-NEXT: v_dual_mov_b32 v0, s50 :: v_dual_mov_b32 v1, s51 +; GFX1250-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49 ; GFX1250-NEXT: s_wait_xcnt 0x4 -; GFX1250-NEXT: v_mov_b32_e32 v4, s80 -; GFX1250-NEXT: s_lshr_b32 s92, s10, 12 -; GFX1250-NEXT: s_lshr_b32 s78, s10, 10 -; GFX1250-NEXT: s_bfe_i64 s[98:99], s[98:99], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v5, s81 :: v_dual_mov_b32 v6, s82 +; GFX1250-NEXT: v_mov_b32_e32 v6, s46 +; GFX1250-NEXT: v_readlane_b32 s0, v5, 6 +; GFX1250-NEXT: v_readlane_b32 s1, v5, 7 +; GFX1250-NEXT: v_dual_mov_b32 v7, s47 :: v_dual_mov_b32 v8, s44 ; GFX1250-NEXT: s_wait_xcnt 0x3 -; GFX1250-NEXT: v_dual_mov_b32 v7, s83 :: v_dual_mov_b32 v10, s84 -; GFX1250-NEXT: s_lshr_b32 s66, s10, 8 -; GFX1250-NEXT: s_lshr_b32 s56, s10, 6 -; GFX1250-NEXT: s_lshr_b32 s44, s10, 4 -; GFX1250-NEXT: s_lshr_b32 s30, s10, 2 -; GFX1250-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[62:63], s[74:75], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[74:75], s[88:89], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[88:89], s[94:95], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[94:95], s[100:101], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v11, s85 :: v_dual_mov_b32 v12, s86 +; GFX1250-NEXT: v_dual_mov_b32 v9, s45 :: v_dual_mov_b32 v10, s42 +; GFX1250-NEXT: v_dual_mov_b32 v11, s43 :: v_dual_mov_b32 v12, s40 ; GFX1250-NEXT: s_wait_xcnt 0x2 -; GFX1250-NEXT: v_dual_mov_b32 v13, s87 :: v_dual_mov_b32 v14, s90 -; GFX1250-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v15, s91 :: v_dual_mov_b32 v16, s98 +; GFX1250-NEXT: v_dual_mov_b32 v13, s41 :: v_dual_mov_b32 v14, s38 +; GFX1250-NEXT: v_dual_mov_b32 v15, s39 :: v_dual_mov_b32 v16, s36 ; GFX1250-NEXT: s_wait_xcnt 0x1 -; GFX1250-NEXT: v_dual_mov_b32 v17, s99 :: v_dual_mov_b32 v18, s96 -; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v19, s97 :: v_dual_mov_b32 v20, s94 +; GFX1250-NEXT: v_dual_mov_b32 v17, s37 :: v_dual_mov_b32 v18, s34 +; GFX1250-NEXT: v_dual_mov_b32 v19, s35 :: v_dual_mov_b32 v20, s30 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v21, s95 :: v_dual_mov_b32 v22, s92 -; GFX1250-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v23, s93 :: v_dual_mov_b32 v24, s88 -; GFX1250-NEXT: v_mov_b32_e32 v25, s89 +; GFX1250-NEXT: v_dual_mov_b32 v21, s31 :: v_dual_mov_b32 v22, s26 +; GFX1250-NEXT: v_dual_mov_b32 v23, s27 :: v_dual_mov_b32 v24, s58 +; GFX1250-NEXT: v_mov_b32_e32 v25, s59 ; GFX1250-NEXT: s_clause 0x5 -; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:176 -; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:160 -; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:144 -; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:128 -; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:112 -; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:96 -; GFX1250-NEXT: s_wait_xcnt 0x5 -; GFX1250-NEXT: v_dual_mov_b32 v0, s78 :: v_dual_mov_b32 v1, s79 -; GFX1250-NEXT: v_dual_mov_b32 v2, s74 :: v_dual_mov_b32 v3, s75 -; GFX1250-NEXT: s_wait_xcnt 0x4 -; GFX1250-NEXT: v_mov_b32_e32 v4, s66 -; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v5, s67 :: v_dual_mov_b32 v6, s62 +; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[8:9] offset:176 +; GFX1250-NEXT: global_store_b128 v4, v[6:9], s[8:9] offset:160 +; GFX1250-NEXT: global_store_b128 v4, v[10:13], s[8:9] offset:144 +; GFX1250-NEXT: global_store_b128 v4, v[14:17], s[8:9] offset:128 +; GFX1250-NEXT: global_store_b128 v4, v[18:21], s[8:9] offset:112 +; GFX1250-NEXT: global_store_b128 v4, v[22:25], s[8:9] offset:96 ; GFX1250-NEXT: s_wait_xcnt 0x3 -; GFX1250-NEXT: v_dual_mov_b32 v7, s63 :: v_dual_mov_b32 v10, s56 -; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v11, s57 :: v_dual_mov_b32 v12, s52 +; GFX1250-NEXT: v_dual_mov_b32 v11, s73 :: v_dual_mov_b32 v12, s0 +; GFX1250-NEXT: v_readlane_b32 s0, v5, 4 +; GFX1250-NEXT: v_mov_b32_e32 v13, s1 +; GFX1250-NEXT: v_readlane_b32 s1, v5, 5 +; GFX1250-NEXT: v_dual_mov_b32 v0, s62 :: v_dual_mov_b32 v1, s63 ; GFX1250-NEXT: s_wait_xcnt 0x2 -; GFX1250-NEXT: v_dual_mov_b32 v13, s53 :: v_dual_mov_b32 v14, s44 -; GFX1250-NEXT: v_dual_mov_b32 v15, s45 :: v_dual_mov_b32 v16, s40 +; GFX1250-NEXT: v_dual_mov_b32 v2, s64 :: v_dual_mov_b32 v14, s0 +; GFX1250-NEXT: v_readlane_b32 s0, v5, 2 +; GFX1250-NEXT: v_mov_b32_e32 v15, s1 +; GFX1250-NEXT: v_readlane_b32 s1, v5, 3 +; GFX1250-NEXT: v_dual_mov_b32 v3, s65 :: v_dual_mov_b32 v6, s68 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mov_b32_e32 v16, s0 +; GFX1250-NEXT: v_readlane_b32 s0, v5, 0 +; GFX1250-NEXT: v_mov_b32_e32 v17, s1 +; GFX1250-NEXT: v_readlane_b32 s1, v5, 1 +; GFX1250-NEXT: v_dual_mov_b32 v7, s69 :: v_dual_mov_b32 v8, s70 ; GFX1250-NEXT: s_wait_xcnt 0x1 -; GFX1250-NEXT: v_dual_mov_b32 v17, s41 :: v_dual_mov_b32 v18, s30 -; GFX1250-NEXT: v_dual_mov_b32 v19, s31 :: v_dual_mov_b32 v20, s26 +; GFX1250-NEXT: v_mov_b32_e32 v18, s0 +; GFX1250-NEXT: v_readlane_b32 s0, v34, 30 +; GFX1250-NEXT: v_mov_b32_e32 v19, s1 +; GFX1250-NEXT: v_readlane_b32 s1, v34, 31 +; GFX1250-NEXT: v_dual_mov_b32 v9, s71 :: v_dual_mov_b32 v10, s72 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mov_b32_e32 v20, s0 +; GFX1250-NEXT: v_readlane_b32 s0, v34, 26 +; GFX1250-NEXT: v_mov_b32_e32 v21, s1 +; GFX1250-NEXT: v_readlane_b32 s1, v34, 27 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v21, s27 :: v_dual_mov_b32 v22, s18 -; GFX1250-NEXT: v_dual_mov_b32 v23, s19 :: v_dual_mov_b32 v24, s10 -; GFX1250-NEXT: v_mov_b32_e32 v25, s11 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mov_b32_e32 v22, s0 +; GFX1250-NEXT: v_readlane_b32 s0, v34, 28 +; GFX1250-NEXT: v_mov_b32_e32 v23, s1 +; GFX1250-NEXT: v_readlane_b32 s1, v34, 29 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v24, s0 :: v_dual_mov_b32 v25, s1 ; GFX1250-NEXT: s_clause 0x5 -; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:80 -; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:64 -; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:48 -; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:32 -; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:16 -; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] +; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[8:9] offset:80 +; GFX1250-NEXT: global_store_b128 v4, v[6:9], s[8:9] offset:64 +; GFX1250-NEXT: global_store_b128 v4, v[10:13], s[8:9] offset:48 +; GFX1250-NEXT: global_store_b128 v4, v[14:17], s[8:9] offset:32 +; GFX1250-NEXT: global_store_b128 v4, v[18:21], s[8:9] offset:16 +; GFX1250-NEXT: global_store_b128 v4, v[22:25], s[8:9] ; GFX1250-NEXT: s_endpgm %load = load <64 x i1>, ptr addrspace(4) %in %ext = sext <64 x i1> %load to <64 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index a135b43bad0fe..01c012f59c34a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -2612,27 +2612,29 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s14, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s3, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff @@ -2642,60 +2644,56 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: @@ -2717,14 +2715,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_lshr_b32 s25, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s26, s9, 16 ; GCN-HSA-NEXT: s_lshr_b32 s27, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s29, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s31, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s33, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s34, s14, 16 -; GCN-HSA-NEXT: s_and_b32 s35, s1, 0xffff -; GCN-HSA-NEXT: s_and_b32 s36, s0, 0xffff +; GCN-HSA-NEXT: s_and_b32 s28, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s29, s0, 0xffff ; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff @@ -2733,39 +2725,45 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s30, s11, 16 ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s31, s10, 16 ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s33, s13, 16 ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s34, s12, 16 ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s35, s15, 16 ; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s36, s14, 16 ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s31 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s30 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -2803,9 +2801,9 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3096,90 +3094,88 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s1, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s19, s0, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s20, s1 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s21, s0 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s1, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s0, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s1, s1 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s0, s0 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s22, s3, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s2, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s24, s3 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s25, s2 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s4, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s3, s3 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s2, s2 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s4, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s6, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s6, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s8, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s8, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s10, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s10, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s36, s12, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s12, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s15, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s14, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s15, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s36, s14, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: @@ -3191,8 +3187,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s18, s1, 16 -; GCN-HSA-NEXT: s_ashr_i32 s19, s0, 16 +; GCN-HSA-NEXT: s_ashr_i32 s20, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s21, s0, 16 ; GCN-HSA-NEXT: s_ashr_i32 s22, s3, 16 ; GCN-HSA-NEXT: s_ashr_i32 s23, s2, 16 ; GCN-HSA-NEXT: s_ashr_i32 s24, s5, 16 @@ -3207,32 +3203,32 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_ashr_i32 s34, s12, 16 ; GCN-HSA-NEXT: s_ashr_i32 s35, s15, 16 ; GCN-HSA-NEXT: s_ashr_i32 s36, s14, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s21, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: s_sext_i32_i16 s20, s1 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 -; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 +; GCN-HSA-NEXT: s_add_u32 s18, s16, 0x70 ; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 +; GCN-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s0 +; GCN-HSA-NEXT: s_sext_i32_i16 s0, s13 +; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 +; GCN-HSA-NEXT: s_sext_i32_i16 s12, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 @@ -3287,10 +3283,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -3587,31 +3583,89 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s39, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s38, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s37, s17 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[18:19], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s1, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s0, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s2, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s41, s5, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s45, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s47, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s49, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s51, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s53, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s14, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s39, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s40, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s43, s3, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s44, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s43, s7, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s44, s30, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s30, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s6, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s30 +; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s31, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s31, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s9, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s28, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s28, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s31 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s8, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s28 +; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s29, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s29, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s28 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s29 +; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s26, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s26, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s29 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s26 +; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s27, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s27, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s26 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s27 +; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s24, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s24, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s27 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s12, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s24 +; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s25, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s25, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s15, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s25 +; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s22, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s22, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s25 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s14, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s22 +; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s23, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s23, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s22 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s17, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s23 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s20, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s20, 0xffff +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s16, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s21, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s21, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s19, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s18, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff @@ -3624,351 +3678,296 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s55, s17, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s16, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s57, s19, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s18, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s59, s21, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s20, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s61, s23, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s22, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s63, s25, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s64, s24, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s65, s27, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s66, s26, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s67, s29, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s68, s28, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s69, s31, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s70, s30, 16 ; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s17, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s20, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s23, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s25, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s24, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s27, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s29, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s28, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s31, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s30, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s21, 0xffff -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s56 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s20, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s21, s0, 16 -; GCN-HSA-NEXT: s_lshr_b32 s22, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s23, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s33, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s39, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s42, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s44, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s45, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16 -; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff -; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff -; GCN-HSA-NEXT: s_and_b32 s29, s3, 0xffff -; GCN-HSA-NEXT: s_and_b32 s31, s2, 0xffff -; GCN-HSA-NEXT: s_and_b32 s34, s5, 0xffff -; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff -; GCN-HSA-NEXT: s_and_b32 s38, s7, 0xffff -; GCN-HSA-NEXT: s_and_b32 s40, s6, 0xffff -; GCN-HSA-NEXT: s_and_b32 s41, s9, 0xffff -; GCN-HSA-NEXT: s_and_b32 s43, s8, 0xffff -; GCN-HSA-NEXT: s_and_b32 s47, s11, 0xffff -; GCN-HSA-NEXT: s_and_b32 s48, s10, 0xffff -; GCN-HSA-NEXT: s_and_b32 s49, s13, 0xffff -; GCN-HSA-NEXT: s_and_b32 s51, s12, 0xffff -; GCN-HSA-NEXT: s_and_b32 s50, s15, 0xffff -; GCN-HSA-NEXT: s_and_b32 s52, s14, 0xffff -; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 +; GCN-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s18, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s19, s0, 16 -; GCN-HSA-NEXT: s_lshr_b32 s53, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s54, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s55, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s56, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s57, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s58, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s59, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s60, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s61, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s62, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s63, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s64, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s65, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s66, s14, 16 -; GCN-HSA-NEXT: s_and_b32 s67, s1, 0xffff -; GCN-HSA-NEXT: s_and_b32 s68, s0, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s33, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s0, 16 +; GCN-HSA-NEXT: s_lshr_b32 s35, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s40, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s41, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s42, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s43, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s44, s6, 16 +; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s45, s9, 16 ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s46, s8, 16 ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s47, s11, 16 ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s48, s10, 16 ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s49, s13, 16 ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s50, s12, 16 ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s38, s15, 16 ; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s39, s14, 16 ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s61 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s59 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s42 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] +; GCN-HSA-NEXT: s_lshr_b32 s51, s17, 16 +; GCN-HSA-NEXT: s_and_b32 s52, s17, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s53, s16, 16 +; GCN-HSA-NEXT: s_and_b32 s54, s16, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s55, s19, 16 +; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s56, s18, 16 +; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s57, s21, 16 +; GCN-HSA-NEXT: s_and_b32 s21, s21, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s58, s20, 16 +; GCN-HSA-NEXT: s_and_b32 s20, s20, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s59, s23, 16 +; GCN-HSA-NEXT: s_and_b32 s23, s23, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s60, s22, 16 +; GCN-HSA-NEXT: s_and_b32 s22, s22, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s61, s25, 16 +; GCN-HSA-NEXT: s_and_b32 s25, s25, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s62, s24, 16 +; GCN-HSA-NEXT: s_and_b32 s24, s24, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s63, s27, 16 +; GCN-HSA-NEXT: s_and_b32 s27, s27, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s64, s26, 16 +; GCN-HSA-NEXT: s_and_b32 s26, s26, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s65, s29, 16 +; GCN-HSA-NEXT: s_and_b32 s29, s29, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s66, s28, 16 +; GCN-HSA-NEXT: s_and_b32 s28, s28, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s67, s31, 16 +; GCN-HSA-NEXT: s_and_b32 s31, s31, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s68, s30, 16 +; GCN-HSA-NEXT: s_and_b32 s30, s30, 0xffff +; GCN-HSA-NEXT: s_add_u32 s16, s36, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s17, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s16 +; GCN-HSA-NEXT: s_add_u32 s16, s36, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s17, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s16 +; GCN-HSA-NEXT: s_add_u32 s16, s36, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s17, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s16 +; GCN-HSA-NEXT: s_add_u32 s16, s36, 0xc0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s17, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GCN-HSA-NEXT: s_add_u32 s16, s36, 0xb0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7] +; GCN-HSA-NEXT: s_addc_u32 s17, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s17 +; GCN-HSA-NEXT: s_add_u32 s16, s36, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[12:15] +; GCN-HSA-NEXT: s_addc_u32 s17, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s17 +; GCN-HSA-NEXT: s_add_u32 s16, s36, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s17, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GCN-HSA-NEXT: s_add_u32 s14, s36, 0x80 +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s15 +; GCN-HSA-NEXT: s_addc_u32 s15, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s15 +; GCN-HSA-NEXT: s_add_u32 s14, s36, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s16 +; GCN-HSA-NEXT: s_addc_u32 s15, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 +; GCN-HSA-NEXT: s_add_u32 s12, s36, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[24:27] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s13 +; GCN-HSA-NEXT: s_addc_u32 s13, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s36, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: s_addc_u32 s11, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s36, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GCN-HSA-NEXT: s_addc_u32 s9, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s36, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: s_addc_u32 s7, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s36, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: s_addc_u32 s5, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s36, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s37 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -3976,34 +3975,57 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s17, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s16, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s19, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s18, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s21, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s20, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s23, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s22, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s25, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s24, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s27, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s26, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s29, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s28, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s31, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s30, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s0, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s0, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s2, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s13, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s15, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s14, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s17, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s16, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s19, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s18, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s21, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s20, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s23, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s22, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s25, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s24, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s27, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s26, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s29, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s28, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s31, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s30, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s10, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, 0xffff @@ -4018,170 +4040,151 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s10, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s12, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s15, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s14, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s1, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s0, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s15, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s14, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xf0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xe0 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s31, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s30, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xf0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xe0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xd0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xd0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xc0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xc0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xb0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xb0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0xa0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xa0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x90 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x90 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x80 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x70 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x60 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 0x50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s38 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x50 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 32 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s36, 48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s36, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s36, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s36, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s37 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -4506,388 +4509,387 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x10 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s39, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s38, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s37, s1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s17, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s16, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s17, s17 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s16, s16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s19, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s18, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s18, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s19, s19 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s18, s18 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s21, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s20, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s21, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s42, s20, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s21, s21 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s20, s20 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s42, s22, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s43, s14 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s23, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s43 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s22, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s23, s23 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s14, s14, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s22, s22 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s25, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s24, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s14 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s15 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s15, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s25, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s14, s24, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s25, s25 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s12, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s24, s24 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s27, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s26, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s12 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s13 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s13, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s27, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s12 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s26, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s27, s27 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s10, s10, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s26, s26 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s29, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s48, s28, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s10 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s11 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s11, s11, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s11, s29, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s10 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s10, s28, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s29, s29 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s8, s8, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s28, s28 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s49, s31, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s50, s30, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s8 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s9 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s9, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s8 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s31, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s8 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s8, s30, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s31, s31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s6, s6, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s30, s30 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s51, s1, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s52, s0, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s53, s1 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s54, s0 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s3, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s2, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s57, s3 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s58, s2 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s4, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s6, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s62, s7 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s8, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s10, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s12, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s69, s15, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s70, s14, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s6 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s7 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s7, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s6, s1, 16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s4 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s0, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s1, s1 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s4, s4, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s5 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s5, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s4, s3, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s2, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s3, s3 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s2, s2 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s53 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s44 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s40 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: s_endpgm -; -; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32: -; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s20, s1, 16 -; GCN-HSA-NEXT: s_ashr_i32 s21, s0, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s22, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s23, s0 -; GCN-HSA-NEXT: s_ashr_i32 s24, s3, 16 -; GCN-HSA-NEXT: s_ashr_i32 s25, s2, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s26, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s27, s2 -; GCN-HSA-NEXT: s_ashr_i32 s28, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s29, s4, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s30, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s31, s4 -; GCN-HSA-NEXT: s_ashr_i32 s33, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s34, s6, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s35, s7 -; GCN-HSA-NEXT: s_sext_i32_i16 s36, s6 -; GCN-HSA-NEXT: s_ashr_i32 s37, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s38, s8, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s39, s9 -; GCN-HSA-NEXT: s_sext_i32_i16 s40, s8 -; GCN-HSA-NEXT: s_ashr_i32 s41, s11, 16 -; GCN-HSA-NEXT: s_ashr_i32 s42, s10, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s43, s11 -; GCN-HSA-NEXT: s_sext_i32_i16 s44, s10 -; GCN-HSA-NEXT: s_ashr_i32 s45, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s47, s12, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s46, s13 -; GCN-HSA-NEXT: s_sext_i32_i16 s49, s12 -; GCN-HSA-NEXT: s_ashr_i32 s48, s15, 16 -; GCN-HSA-NEXT: s_ashr_i32 s50, s14, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s51, s15 -; GCN-HSA-NEXT: s_sext_i32_i16 s52, s14 -; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 -; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s18, s1, 16 -; GCN-HSA-NEXT: s_ashr_i32 s19, s0, 16 -; GCN-HSA-NEXT: s_ashr_i32 s53, s3, 16 -; GCN-HSA-NEXT: s_ashr_i32 s54, s2, 16 -; GCN-HSA-NEXT: s_ashr_i32 s57, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s58, s4, 16 -; GCN-HSA-NEXT: s_ashr_i32 s59, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s60, s6, 16 -; GCN-HSA-NEXT: s_ashr_i32 s61, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s62, s8, 16 -; GCN-HSA-NEXT: s_ashr_i32 s63, s11, 16 -; GCN-HSA-NEXT: s_ashr_i32 s64, s10, 16 -; GCN-HSA-NEXT: s_ashr_i32 s65, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s66, s12, 16 -; GCN-HSA-NEXT: s_ashr_i32 s67, s15, 16 -; GCN-HSA-NEXT: s_ashr_i32 s68, s14, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s56, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 -; GCN-HSA-NEXT: s_sext_i32_i16 s55, s3 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 -; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] -; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 -; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0 -; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 -; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s1, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 -; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 +; GCN-NOHSA-SI-NEXT: s_endpgm +; +; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32: +; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 +; GCN-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_ashr_i32 s33, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s34, s0, 16 +; GCN-HSA-NEXT: s_ashr_i32 s35, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s40, s2, 16 +; GCN-HSA-NEXT: s_ashr_i32 s41, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s42, s4, 16 +; GCN-HSA-NEXT: s_ashr_i32 s43, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s44, s6, 16 +; GCN-HSA-NEXT: s_ashr_i32 s45, s9, 16 +; GCN-HSA-NEXT: s_ashr_i32 s46, s8, 16 +; GCN-HSA-NEXT: s_ashr_i32 s47, s11, 16 +; GCN-HSA-NEXT: s_ashr_i32 s48, s10, 16 +; GCN-HSA-NEXT: s_ashr_i32 s38, s13, 16 +; GCN-HSA-NEXT: s_ashr_i32 s39, s12, 16 +; GCN-HSA-NEXT: s_ashr_i32 s49, s15, 16 +; GCN-HSA-NEXT: s_ashr_i32 s50, s14, 16 +; GCN-HSA-NEXT: s_ashr_i32 s51, s17, 16 +; GCN-HSA-NEXT: s_ashr_i32 s52, s16, 16 +; GCN-HSA-NEXT: s_ashr_i32 s53, s19, 16 +; GCN-HSA-NEXT: s_ashr_i32 s56, s18, 16 +; GCN-HSA-NEXT: s_ashr_i32 s57, s21, 16 +; GCN-HSA-NEXT: s_ashr_i32 s58, s20, 16 +; GCN-HSA-NEXT: s_ashr_i32 s59, s23, 16 +; GCN-HSA-NEXT: s_ashr_i32 s60, s22, 16 +; GCN-HSA-NEXT: s_ashr_i32 s61, s25, 16 +; GCN-HSA-NEXT: s_ashr_i32 s62, s24, 16 +; GCN-HSA-NEXT: s_ashr_i32 s63, s27, 16 +; GCN-HSA-NEXT: s_ashr_i32 s64, s26, 16 +; GCN-HSA-NEXT: s_ashr_i32 s65, s29, 16 +; GCN-HSA-NEXT: s_ashr_i32 s66, s28, 16 +; GCN-HSA-NEXT: s_ashr_i32 s67, s31, 16 +; GCN-HSA-NEXT: s_ashr_i32 s68, s30, 16 +; GCN-HSA-NEXT: s_add_u32 s54, s36, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s55, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s55 +; GCN-HSA-NEXT: s_add_u32 s54, s36, 0xe0 +; GCN-HSA-NEXT: s_sext_i32_i16 s20, s20 +; GCN-HSA-NEXT: s_addc_u32 s55, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s20 +; GCN-HSA-NEXT: s_sext_i32_i16 s20, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s20 +; GCN-HSA-NEXT: s_add_u32 s20, s36, 0xd0 +; GCN-HSA-NEXT: s_sext_i32_i16 s30, s30 +; GCN-HSA-NEXT: s_addc_u32 s21, s37, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s18, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 +; GCN-HSA-NEXT: s_sext_i32_i16 s30, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s21 +; GCN-HSA-NEXT: s_add_u32 s18, s36, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s20 +; GCN-HSA-NEXT: s_sext_i32_i16 s20, s19 +; GCN-HSA-NEXT: s_addc_u32 s19, s37, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s28, s28 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[1:4] +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s28 +; GCN-HSA-NEXT: s_sext_i32_i16 s28, s29 +; GCN-HSA-NEXT: s_sext_i32_i16 s26, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s18 +; GCN-HSA-NEXT: s_sext_i32_i16 s16, s16 +; GCN-HSA-NEXT: s_add_u32 s18, s36, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s26 +; GCN-HSA-NEXT: s_sext_i32_i16 s26, s27 +; GCN-HSA-NEXT: s_sext_i32_i16 s24, s24 +; GCN-HSA-NEXT: s_addc_u32 s19, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: s_sext_i32_i16 s16, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s24 +; GCN-HSA-NEXT: s_sext_i32_i16 s24, s25 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[5:8] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16 +; GCN-HSA-NEXT: s_add_u32 s16, s36, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s61 +; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[9:12] +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[13:16] +; GCN-HSA-NEXT: s_addc_u32 s17, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s17 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 +; GCN-HSA-NEXT: s_add_u32 s16, s36, 0x90 +; GCN-HSA-NEXT: s_sext_i32_i16 s22, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 +; GCN-HSA-NEXT: s_addc_u32 s17, s37, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s22 +; GCN-HSA-NEXT: s_sext_i32_i16 s22, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 +; GCN-HSA-NEXT: s_add_u32 s14, s36, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s18 +; GCN-HSA-NEXT: s_addc_u32 s15, s37, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s14 +; GCN-HSA-NEXT: s_add_u32 s14, s36, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s15, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s14 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s0 +; GCN-HSA-NEXT: s_sext_i32_i16 s0, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s49 +; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s36, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[21:24] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 +; GCN-HSA-NEXT: s_sext_i32_i16 s12, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: s_add_u32 s0, s36, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s1, s37, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s39 +; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 +; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s36, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s47 +; GCN-HSA-NEXT: s_addc_u32 s1, s37, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 +; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s36, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 +; GCN-HSA-NEXT: s_addc_u32 s1, s37, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 +; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s36, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GCN-HSA-NEXT: s_addc_u32 s1, s37, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s36, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s37 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -7249,103 +7251,106 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s10, s7 -; GCN-HSA-NEXT: s_lshr_b32 s12, s6, 16 -; GCN-HSA-NEXT: s_mov_b32 s14, s5 -; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 16 +; GCN-HSA-NEXT: s_mov_b32 s16, s7 +; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 16 +; GCN-HSA-NEXT: s_mov_b32 s20, s5 +; GCN-HSA-NEXT: s_lshr_b32 s22, s4, 16 +; GCN-HSA-NEXT: s_mov_b32 s24, s3 +; GCN-HSA-NEXT: s_lshr_b32 s26, s2, 16 +; GCN-HSA-NEXT: s_mov_b32 s28, s1 +; GCN-HSA-NEXT: s_lshr_b32 s12, s0, 16 ; GCN-HSA-NEXT: s_ashr_i32 s25, s1, 31 -; GCN-HSA-NEXT: s_ashr_i32 s29, s3, 31 -; GCN-HSA-NEXT: s_ashr_i32 s30, s3, 16 -; GCN-HSA-NEXT: s_mov_b32 s18, s3 -; GCN-HSA-NEXT: s_lshr_b32 s20, s2, 16 -; GCN-HSA-NEXT: s_mov_b32 s22, s1 -; GCN-HSA-NEXT: s_lshr_b32 s24, s0, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_ashr_i32 s28, s1, 16 -; GCN-HSA-NEXT: s_ashr_i32 s31, s5, 31 -; GCN-HSA-NEXT: s_ashr_i32 s33, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s34, s7, 31 -; GCN-HSA-NEXT: s_ashr_i32 s35, s7, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[22:23], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s27, s1, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[28:29], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s28, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s29, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s33, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s36, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s37, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s38, s7, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[24:25], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[22:23], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_add_u32 s22, s8, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s34 -; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: s_add_u32 s16, s8, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s17, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GCN-HSA-NEXT: s_add_u32 s16, s8, 0x50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: s_addc_u32 s17, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s8, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s8, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s8, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s8, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s8, 16 +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s8, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s8, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -7618,108 +7623,106 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s0, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: @@ -7733,25 +7736,19 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s20, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s21, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s22, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s23, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s25, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s27, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s29, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s31, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s33, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s34, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s4, 16 ; GCN-HSA-NEXT: s_lshr_b32 s19, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s0, 16 ; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-HSA-NEXT: s_and_b32 s35, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s28, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff @@ -7759,11 +7756,17 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff ; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-HSA-NEXT: s_and_b32 s36, s5, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s29, s5, 16 +; GCN-HSA-NEXT: s_and_b32 s30, s5, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s31, s7, 16 ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s33, s9, 16 ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s34, s11, 16 ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s35, s13, 16 ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s36, s15, 16 ; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 @@ -7776,91 +7779,91 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x70 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xe0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xc0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0xa0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s16, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8309,154 +8312,152 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s1, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s3, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s5, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s9, 31 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s11, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s13, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s15, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s8, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s7 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s6, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s4, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s2, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s56, s15 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s14, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s58, s13 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s12, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s54, s11 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s9 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s8, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s52, s7 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s6, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s60, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s4, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s62, s3 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s2, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s64, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s0, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[0:1], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[2:3], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s56 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s0, s15, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s57 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s1, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s0, s15, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s1, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s0 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[64:65], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s56 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s3, 31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s3, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s2, s13, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[62:63], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s2 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s2, s13, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s5, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s2 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s66 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s49, s7, 31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s67 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[54:55], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s54, s7, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s2 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s2, s11, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s3 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s2 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s9, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s2, s11, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s10, s9, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s2 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[46:47], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s50 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[48:49], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s51 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[44:45], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s8 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s9 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[16:19], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s10 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s29 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s39 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[16:19], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[16:19], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[16:19], 0 offset:144 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s52 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s53 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s49 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[16:19], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s47 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s39 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s31 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[16:19], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s41 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[16:19], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[16:19], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s31 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s35 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[16:19], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:96 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[16:19], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[16:19], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, s3 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[16:19], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: @@ -8468,152 +8469,152 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s24, s15 -; GCN-HSA-NEXT: s_ashr_i32 s37, s3, 31 -; GCN-HSA-NEXT: s_ashr_i32 s38, s3, 16 -; GCN-HSA-NEXT: s_ashr_i32 s57, s11, 16 -; GCN-HSA-NEXT: s_ashr_i32 s59, s13, 31 -; GCN-HSA-NEXT: s_ashr_i32 s61, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s63, s15, 31 -; GCN-HSA-NEXT: s_ashr_i32 s65, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16 -; GCN-HSA-NEXT: s_mov_b32 s48, s13 -; GCN-HSA-NEXT: s_lshr_b32 s50, s12, 16 -; GCN-HSA-NEXT: s_mov_b32 s52, s11 -; GCN-HSA-NEXT: s_lshr_b32 s34, s10, 16 -; GCN-HSA-NEXT: s_mov_b32 s30, s9 -; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 16 -; GCN-HSA-NEXT: s_mov_b32 s54, s7 -; GCN-HSA-NEXT: s_lshr_b32 s56, s6, 16 -; GCN-HSA-NEXT: s_mov_b32 s58, s5 -; GCN-HSA-NEXT: s_lshr_b32 s60, s4, 16 -; GCN-HSA-NEXT: s_mov_b32 s62, s3 -; GCN-HSA-NEXT: s_lshr_b32 s64, s2, 16 -; GCN-HSA-NEXT: s_mov_b32 s66, s1 -; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000 +; GCN-HSA-NEXT: s_mov_b32 s26, s5 +; GCN-HSA-NEXT: s_lshr_b32 s28, s4, 16 +; GCN-HSA-NEXT: s_mov_b32 s34, s3 +; GCN-HSA-NEXT: s_lshr_b32 s44, s2, 16 +; GCN-HSA-NEXT: s_mov_b32 s46, s1 +; GCN-HSA-NEXT: s_lshr_b32 s48, s0, 16 +; GCN-HSA-NEXT: s_mov_b32 s50, s7 +; GCN-HSA-NEXT: s_lshr_b32 s52, s6, 16 +; GCN-HSA-NEXT: s_mov_b32 s54, s9 +; GCN-HSA-NEXT: s_lshr_b32 s56, s8, 16 +; GCN-HSA-NEXT: s_ashr_i32 s59, s9, 16 +; GCN-HSA-NEXT: s_mov_b32 s58, s11 +; GCN-HSA-NEXT: s_lshr_b32 s60, s10, 16 +; GCN-HSA-NEXT: s_ashr_i32 s61, s11, 31 +; GCN-HSA-NEXT: s_ashr_i32 s63, s11, 16 +; GCN-HSA-NEXT: s_mov_b32 s62, s13 +; GCN-HSA-NEXT: s_lshr_b32 s64, s12, 16 +; GCN-HSA-NEXT: s_ashr_i32 s65, s13, 31 +; GCN-HSA-NEXT: s_ashr_i32 s69, s13, 16 +; GCN-HSA-NEXT: s_mov_b32 s68, s15 +; GCN-HSA-NEXT: s_lshr_b32 s70, s14, 16 +; GCN-HSA-NEXT: s_ashr_i32 s71, s15, 31 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s33, s1, 31 ; GCN-HSA-NEXT: s_ashr_i32 s36, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s37, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s38, s3, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s39, s5, 31 ; GCN-HSA-NEXT: s_ashr_i32 s40, s5, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s41, s7, 31 ; GCN-HSA-NEXT: s_ashr_i32 s42, s7, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s43, s9, 31 -; GCN-HSA-NEXT: s_ashr_i32 s44, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s45, s11, 31 -; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[70:71], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[72:73], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[74:75], s[14:15], 0x100000 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[54:55], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s54, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s55, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xb0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s63 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s74 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s75 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[66:67], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[72:73], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_ashr_i32 s74, s15, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[48:49], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[46:47], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[34:35], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[52:53], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[50:51], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[56:57], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[54:55], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[60:61], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[64:65], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[62:63], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[70:71], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[68:69], 0x100000 +; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s57 +; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s30 +; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s31 ; GCN-HSA-NEXT: s_addc_u32 s31, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 -; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 -; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 -; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s59 -; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s72 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s74 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s71 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s31 +; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xc0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s31, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s31, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s69 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GCN-HSA-NEXT: s_add_u32 s24, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 +; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GCN-HSA-NEXT: s_add_u32 s24, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s73 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 +; GCN-HSA-NEXT: s_add_u32 s24, s16, 0x90 ; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s46 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s25 +; GCN-HSA-NEXT: s_add_u32 s24, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s72 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s73 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s53 +; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s30 +; GCN-HSA-NEXT: s_add_u32 s24, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s31 +; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s25 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[24:27] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s70 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s71 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s29 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s14 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 @@ -8624,50 +8625,47 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: s_add_u32 s10, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -8678,211 +8676,208 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s0, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s2, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s34, s3 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s5 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s7 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s54, s9 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s60, s11 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s12, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s68, s13 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s74, s14, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s76, s15 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s78, s15, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s1, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[20:21], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[26:27], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s5 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s9 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s11 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s13 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s15 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[72:73], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[22:23], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[30:31], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[36:37], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[40:41], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[42:43], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[46:47], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[48:49], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[50:51], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[54:55], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[58:59], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[60:61], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[62:63], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[66:67], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[68:69], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[70:71], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[74:75], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[76:77], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[78:79], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s9, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s11, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s13, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s15, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xf0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s15, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s66 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s67 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s15 +; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xe0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_addc_u32 s15, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s15 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xd0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s60 -; GCN-NOHSA-VI-NEXT: s_add_u32 s60, s16, 0xf0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s61, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s62 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xc0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58 -; GCN-NOHSA-VI-NEXT: s_add_u32 s58, s16, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s59, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s58 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s72 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s73 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s59 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s16, 0xb0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-VI-NEXT: s_add_u32 s54, s16, 0xd0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s55, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s16, 0xa0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 -; GCN-NOHSA-VI-NEXT: s_add_u32 s52, s16, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s53, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s16, 0x90 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-VI-NEXT: s_add_u32 s48, s16, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s49, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s49 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0xa0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39 -; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0x90 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s16, 0x80 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s16, 0x70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s16, 0x60 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25 -; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NOHSA-VI-NEXT: s_add_u32 s20, s16, 0x50 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s21, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s17 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 6f7ee70812264..d54db611c923b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -3093,39 +3093,36 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v28, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v30, s1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s35 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s34 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s33 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:96 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s30 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:80 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s29 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s28 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:64 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s27 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s26 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[16:19], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s24 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s22 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s20 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s24 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[16:19], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s22 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[16:19], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v29, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v31, s20 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[16:19], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64: @@ -3155,28 +3152,44 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_ashr_i32 s36, s14, 31 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60 -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50 -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s33 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[13:14], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX7-HSA-NEXT: s_add_u32 s12, s16, 0x60 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s13 +; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-HSA-NEXT: s_add_u32 s10, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6] ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 @@ -3184,7 +3197,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s18 +; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 @@ -3194,7 +3207,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s19 +; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 @@ -3203,25 +3216,13 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s14 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9] -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[9:12] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -3967,324 +3968,328 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GFX6-NOHSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX6-NOHSA: ; %bb.0: -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x10 +; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s39, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s38, -1 -; GFX6-NOHSA-NEXT: s_mov_b32 s36, s0 -; GFX6-NOHSA-NEXT: s_mov_b32 s37, s1 -; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX6-NOHSA-NEXT: s_mov_b32 s36, s16 +; GFX6-NOHSA-NEXT: s_mov_b32 s37, s17 +; GFX6-NOHSA-NEXT: s_load_dwordx16 s[16:31], s[18:19], 0x10 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s17, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s16, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s19, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s40, s18, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s41, s21, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s42, s20, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s30, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s31, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s28, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s44 -; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s29, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s43 -; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s23, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s44 -; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s22, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s1, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s34, s0, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s3, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s40, s2, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s41, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s42, s4, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s43, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s6, 31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX6-NOHSA-NEXT: s_ashr_i32 s30, s30, 31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s31, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s30 +; GFX6-NOHSA-NEXT: s_ashr_i32 s30, s9, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s8, 31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s28 +; GFX6-NOHSA-NEXT: s_ashr_i32 s28, s28, 31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s22 +; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s29, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s28 +; GFX6-NOHSA-NEXT: s_ashr_i32 s28, s11, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29 +; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s10, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s13 +; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s13, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s12 +; GFX6-NOHSA-NEXT: s_ashr_i32 s12, s12, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s15 +; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s15, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s14 +; GFX6-NOHSA-NEXT: s_ashr_i32 s14, s14, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s17 +; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s17, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s16 +; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s16, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s19 +; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s19, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s18 +; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s18, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s21 +; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s21, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s20 +; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s20, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s23 +; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s23, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s22 +; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s22, 31 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:240 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s25 +; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s25, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s24, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v28, s27 +; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s27, 31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s26 +; GFX6-NOHSA-NEXT: s_ashr_i32 s26, s26, 31 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:224 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s16 -; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s25, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s27, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s26, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s24, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s17 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s18 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:208 -; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s1, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s0, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s3, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s2, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s4, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s24, s7, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s6, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s26, s9, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s8, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s28, s11, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s10, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s30, s13, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s12, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s22 -; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s15, 31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s16 -; GFX6-NOHSA-NEXT: s_ashr_i32 s16, s14, 31 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:192 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:176 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v29, s27 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[26:29], off, s[36:39], 0 offset:208 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:144 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v29, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:192 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s33 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:128 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s22 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s23 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[23:26], off, s[36:39], 0 offset:176 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s30 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s28 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[17:20], off, s[36:39], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s26 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s24 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[36:39], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[36:39], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s17 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[36:39], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s14 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s15 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[11:14], off, s[36:39], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s12 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[36:39], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s28 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v28, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v30, s30 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[27:30], off, s[36:39], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s42 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[36:39], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s40 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s35 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[21:24], off, s[36:39], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s34 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s33 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[36:39], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 -; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_ashr_i32 s20, s1, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s21, s0, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s22, s3, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s23, s2, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s24, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s25, s4, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s28, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s29, s6, 31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: s_ashr_i32 s36, s9, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s37, s8, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s38, s11, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s39, s10, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s40, s13, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s41, s12, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s42, s15, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s43, s14, 31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s1 -; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s40 +; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_ashr_i32 s18, s1, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s19, s0, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s26, s3, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s27, s2, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s30, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s31, s4, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s33, s1, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s34, s0, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s35, s3, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s40, s2, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s41, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s42, s4, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s43, s7, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s44, s6, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s45, s9, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s46, s8, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s47, s11, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s48, s10, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s49, s13, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s50, s12, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s51, s15, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s38, s13, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s39, s12, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s49, s15, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s52, s14, 31 -; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s35 -; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s35 -; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v36, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s34 -; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xc0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[31:32], v[27:30] -; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s35 -; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s35 -; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0xa0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[33:34], v[23:26] -; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s35 -; GFX7-HSA-NEXT: s_add_u32 s34, s16, 0x90 -; GFX7-HSA-NEXT: s_addc_u32 s35, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x80 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s28 +; GFX7-HSA-NEXT: s_ashr_i32 s53, s17, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s54, s16, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s55, s19, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s56, s18, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s57, s21, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s58, s20, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s59, s23, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s60, s22, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s61, s25, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s62, s24, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s63, s27, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s64, s26, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s65, s29, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s66, s28, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s67, s31, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s68, s30, 31 +; GFX7-HSA-NEXT: s_add_u32 s50, s36, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s51, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s51 +; GFX7-HSA-NEXT: s_add_u32 s50, s36, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s51, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s20 +; GFX7-HSA-NEXT: s_add_u32 s20, s36, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s21 +; GFX7-HSA-NEXT: s_addc_u32 s21, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s20 +; GFX7-HSA-NEXT: s_add_u32 s20, s36, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s21, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s18 +; GFX7-HSA-NEXT: s_add_u32 s18, s36, 0xb0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-HSA-NEXT: s_addc_u32 s19, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-HSA-NEXT: s_add_u32 s16, s36, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[8:11] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[1:2], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GFX7-HSA-NEXT: s_addc_u32 s17, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s66 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s29 -; GFX7-HSA-NEXT: s_add_u32 s28, s16, 0x70 -; GFX7-HSA-NEXT: s_addc_u32 s29, s17, 0 -; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s25 -; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s65 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s17 +; GFX7-HSA-NEXT: s_add_u32 s16, s36, 0x90 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7] +; GFX7-HSA-NEXT: s_addc_u32 s17, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-HSA-NEXT: s_add_u32 s14, s36, 0x80 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[16:19] +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-HSA-NEXT: s_addc_u32 s15, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s15 +; GFX7-HSA-NEXT: s_add_u32 s14, s36, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s16 +; GFX7-HSA-NEXT: s_addc_u32 s15, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s17 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s14 +; GFX7-HSA-NEXT: s_add_u32 s12, s36, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s15 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[24:27] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13 +; GFX7-HSA-NEXT: s_addc_u32 s13, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s34 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[35:36], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28 -; GFX7-HSA-NEXT: s_add_u32 s24, s16, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29 -; GFX7-HSA-NEXT: s_addc_u32 s25, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s21 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[16:19] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[20:23] -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s49 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: s_add_u32 s14, s16, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s47 -; GFX7-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[3:6] -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[6:9] -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-HSA-NEXT: s_add_u32 s10, s36, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-HSA-NEXT: s_addc_u32 s11, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s36, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-HSA-NEXT: s_addc_u32 s9, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 48 +; GFX7-HSA-NEXT: s_add_u32 s6, s36, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 32 +; GFX7-HSA-NEXT: s_add_u32 s4, s36, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX7-HSA-NEXT: s_addc_u32 s5, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 +; GFX7-HSA-NEXT: s_add_u32 s2, s36, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GFX7-HSA-NEXT: s_addc_u32 s3, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -4657,37 +4662,36 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX9-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 ; GFX9-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-HSA-NEXT: s_ashr_i32 s58, s30, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s31, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s28, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v5, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s29, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s26, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v8, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s27, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v10, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s24, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v11, s58 -; GFX9-HSA-NEXT: s_ashr_i32 s58, s25, 31 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s30 +; GFX9-HSA-NEXT: s_ashr_i32 s30, s30, 31 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s31 +; GFX9-HSA-NEXT: s_ashr_i32 s31, s31, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GFX9-HSA-NEXT: s_ashr_i32 s30, s27, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s31 +; GFX9-HSA-NEXT: s_ashr_i32 s31, s26, 31 +; GFX9-HSA-NEXT: s_ashr_i32 s59, s25, 31 +; GFX9-HSA-NEXT: s_ashr_i32 s60, s24, 31 +; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:240 ; GFX9-HSA-NEXT: s_ashr_i32 s57, s23, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v13, s58 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s26 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s30 ; GFX9-HSA-NEXT: s_ashr_i32 s58, s22, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:240 -; GFX9-HSA-NEXT: v_mov_b32_e32 v6, s29 -; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s28 +; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:208 ; GFX9-HSA-NEXT: s_ashr_i32 s55, s21, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s60 +; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s59 ; GFX9-HSA-NEXT: s_ashr_i32 s56, s20, 31 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[4:7], s[36:37] offset:224 +; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:192 +; GFX9-HSA-NEXT: s_ashr_i32 s53, s19, 31 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s22 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s23 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s57 -; GFX9-HSA-NEXT: s_ashr_i32 s53, s19, 31 ; GFX9-HSA-NEXT: s_ashr_i32 s54, s18, 31 ; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:176 ; GFX9-HSA-NEXT: s_ashr_i32 s51, s17, 31 @@ -4745,6 +4749,8 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s41 ; GFX9-HSA-NEXT: s_ashr_i32 s38, s2, 31 +; GFX9-HSA-NEXT: v_mov_b32_e32 v5, s28 +; GFX9-HSA-NEXT: s_ashr_i32 s28, s28, 31 ; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:48 ; GFX9-HSA-NEXT: s_ashr_i32 s33, s1, 31 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s4 @@ -4752,18 +4758,17 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s39 ; GFX9-HSA-NEXT: s_ashr_i32 s34, s0, 31 -; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s26 -; GFX9-HSA-NEXT: v_mov_b32_e32 v9, s27 +; GFX9-HSA-NEXT: v_mov_b32_e32 v6, s28 +; GFX9-HSA-NEXT: s_ashr_i32 s28, s29, 31 ; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:32 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[7:10], s[36:37] offset:208 +; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s29 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s35 -; GFX9-HSA-NEXT: v_mov_b32_e32 v10, s24 -; GFX9-HSA-NEXT: v_mov_b32_e32 v12, s25 +; GFX9-HSA-NEXT: v_mov_b32_e32 v8, s28 ; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[1:4], s[36:37] offset:16 -; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[10:13], s[36:37] offset:192 +; GFX9-HSA-NEXT: global_store_dwordx4 v0, v[5:8], s[36:37] offset:224 ; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s1 @@ -5070,6 +5075,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xe0 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 @@ -5077,29 +5083,12 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xd0 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xa0 -; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0 -; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90 -; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -5112,20 +5101,36 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX7-HSA-NEXT: s_add_u32 s24, s36, 0xc0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s25, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: s_add_u32 s22, s36, 0xb0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s23, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-HSA-NEXT: s_add_u32 s20, s36, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s21, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-HSA-NEXT: s_add_u32 s18, s36, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s19, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: s_add_u32 s16, s36, 0x80 @@ -5742,55 +5747,56 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v32i32: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 +; GFX7-HSA-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 +; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x70 -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 +; GFX7-HSA-NEXT: s_add_u32 s24, s36, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 +; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s25, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: s_add_u32 s18, s16, 64 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX7-HSA-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-HSA-NEXT: s_add_u32 s20, s36, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-HSA-NEXT: s_addc_u32 s21, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-HSA-NEXT: s_add_u32 s16, s36, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-HSA-NEXT: s_addc_u32 s17, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: s_add_u32 s12, s16, 48 +; GFX7-HSA-NEXT: s_add_u32 s12, s36, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-HSA-NEXT: s_addc_u32 s13, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 @@ -5798,9 +5804,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 32 +; GFX7-HSA-NEXT: s_add_u32 s8, s36, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 @@ -5808,20 +5814,20 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 16 +; GFX7-HSA-NEXT: s_add_u32 s4, s36, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX7-HSA-NEXT: s_addc_u32 s5, s37, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s37 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 542b0ccedbf14..32dcecacecaff 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -653,55 +653,56 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: constant_load_v16i64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GFX7-NEXT: s_add_i32 s12, s12, s17 -; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 -; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 +; GFX7-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 +; GFX7-NEXT: s_add_u32 s0, s36, 0x70 +; GFX7-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-NEXT: v_mov_b32_e32 v7, s1 +; GFX7-NEXT: v_mov_b32_e32 v6, s0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-NEXT: v_mov_b32_e32 v7, s11 -; GFX7-NEXT: v_mov_b32_e32 v8, s4 -; GFX7-NEXT: v_mov_b32_e32 v9, s5 -; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: v_mov_b32_e32 v11, s7 -; GFX7-NEXT: v_mov_b32_e32 v12, s0 -; GFX7-NEXT: v_mov_b32_e32 v13, s1 -; GFX7-NEXT: v_mov_b32_e32 v14, s2 -; GFX7-NEXT: v_mov_b32_e32 v15, s3 -; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GFX7-NEXT: s_add_u32 s18, s16, 0x70 -; GFX7-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v16, s18 -; GFX7-NEXT: v_mov_b32_e32 v17, s19 -; GFX7-NEXT: s_add_u32 s18, s16, 0x60 -; GFX7-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX7-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-NEXT: s_add_u32 s18, s16, 0x50 +; GFX7-NEXT: v_mov_b32_e32 v0, s28 +; GFX7-NEXT: v_mov_b32_e32 v1, s29 +; GFX7-NEXT: v_mov_b32_e32 v2, s30 +; GFX7-NEXT: v_mov_b32_e32 v3, s31 +; GFX7-NEXT: v_mov_b32_e32 v4, s24 +; GFX7-NEXT: s_add_u32 s24, s36, 0x60 +; GFX7-NEXT: v_mov_b32_e32 v5, s25 +; GFX7-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 +; GFX7-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GFX7-NEXT: s_addc_u32 s25, s37, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s24 +; GFX7-NEXT: v_mov_b32_e32 v6, s26 +; GFX7-NEXT: v_mov_b32_e32 v7, s27 +; GFX7-NEXT: v_mov_b32_e32 v1, s25 ; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX7-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-NEXT: s_add_u32 s18, s16, 64 -; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; GFX7-NEXT: s_addc_u32 s19, s17, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: s_add_u32 s20, s36, 0x50 +; GFX7-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-NEXT: s_addc_u32 s21, s37, 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-NEXT: v_mov_b32_e32 v3, s23 +; GFX7-NEXT: v_mov_b32_e32 v5, s21 +; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-NEXT: s_nop 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: s_add_u32 s16, s36, 64 +; GFX7-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-NEXT: s_addc_u32 s17, s37, 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-NEXT: s_add_u32 s12, s16, 48 +; GFX7-NEXT: s_add_u32 s12, s36, 48 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: s_addc_u32 s13, s17, 0 +; GFX7-NEXT: s_addc_u32 s13, s37, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-NEXT: v_mov_b32_e32 v3, s15 @@ -709,9 +710,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_nop 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: s_add_u32 s8, s16, 32 +; GFX7-NEXT: s_add_u32 s8, s36, 32 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-NEXT: s_addc_u32 s9, s37, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 @@ -719,20 +720,20 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_nop 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: s_add_u32 s4, s16, 16 +; GFX7-NEXT: s_add_u32 s4, s36, 16 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: s_addc_u32 s5, s17, 0 +; GFX7-NEXT: s_addc_u32 s5, s37, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-NEXT: v_mov_b32_e32 v4, s36 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-NEXT: v_mov_b32_e32 v5, s37 ; GFX7-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 2d60c5729ed52..87bdf8f95d7e7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -2487,47 +2487,47 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s21, s8, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s22, s9, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s23, s9, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s24, s10, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s25, s10, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s27, s11, 0x80008 -; GFX7-HSA-NEXT: s_and_b32 s28, s4, 0xff +; GFX7-HSA-NEXT: s_and_b32 s24, s4, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s29, s5, 0xff +; GFX7-HSA-NEXT: s_and_b32 s25, s5, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s30, s6, 0xff +; GFX7-HSA-NEXT: s_and_b32 s26, s6, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s31, s7, 0xff +; GFX7-HSA-NEXT: s_and_b32 s27, s7, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s33, s8, 0xff +; GFX7-HSA-NEXT: s_and_b32 s28, s8, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s34, s9, 0xff +; GFX7-HSA-NEXT: s_and_b32 s29, s9, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s35, s10, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s30, s10, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s31, s10, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s33, s10, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s34, s11, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s35, s11, 0x80008 ; GFX7-HSA-NEXT: s_and_b32 s36, s11, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s24 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s30 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 @@ -2539,7 +2539,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 @@ -2548,7 +2548,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 @@ -2557,21 +2557,21 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 @@ -2953,114 +2953,114 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: -; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_ashr_i32 s12, s4, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s13, s4, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s14, s4, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s15, s5, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s16, s5, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s17, s5, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s18, s6, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s19, s6, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s20, s6, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s21, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s22, s7, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s23, s7, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s24, s8, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s25, s8, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s26, s8, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s27, s9, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s28, s9, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s29, s9, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s30, s10, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s31, s10, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s33, s10, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s34, s11, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s35, s11, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s36, s11, 0x80008 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 -; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX7-HSA-NEXT: s_ashr_i32 s12, s0, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s13, s0, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s14, s0, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s15, s1, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s16, s1, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s17, s1, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s18, s2, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s19, s2, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s20, s2, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s21, s3, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s22, s3, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s23, s3, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s24, s4, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s25, s4, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s26, s4, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s27, s5, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s28, s5, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s29, s5, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s30, s6, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s31, s6, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s33, s6, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s34, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s35, s7, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s36, s7, 0x80008 +; GFX7-HSA-NEXT: s_add_u32 s10, s8, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 +; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s8, 0x60 +; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s1 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-HSA-NEXT: s_add_u32 s0, s8, 0x50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s33 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s30 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s8, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: s_sext_i32_i8 s8, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_sext_i32_i8 s3, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -3374,14 +3374,16 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s18, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s0, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s19, s0, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s1, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s21, s1, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s2, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s2, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s0, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s21, s0, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s1, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s1, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s2, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s2, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s3, 24 ; GFX6-NOHSA-NEXT: s_bfe_u32 s27, s3, 0x80008 ; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 24 ; GFX6-NOHSA-NEXT: s_bfe_u32 s29, s4, 0x80008 @@ -3397,143 +3399,139 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_u32 s40, s9, 0x80008 ; GFX6-NOHSA-NEXT: s_lshr_b32 s41, s10, 24 ; GFX6-NOHSA-NEXT: s_bfe_u32 s42, s10, 0x80008 +; GFX6-NOHSA-NEXT: s_and_b32 s43, s15, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s44, s15, 0x80008 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s43 ; GFX6-NOHSA-NEXT: s_lshr_b32 s43, s11, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s44 +; GFX6-NOHSA-NEXT: s_bfe_u32 s44, s15, 0x80010 +; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s15, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NOHSA-NEXT: s_and_b32 s44, s14, 0xff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s14, 0x80008 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s44 ; GFX6-NOHSA-NEXT: s_bfe_u32 s44, s11, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s45, s12, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s46, s12, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s47, s13, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s48, s13, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s49, s14, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s50, s14, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s51, s15, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s52, s15, 0x80008 -; GFX6-NOHSA-NEXT: s_and_b32 s26, s0, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s0, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s53, s1, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s54, s1, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s55, s2, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s56, s2, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s57, s3, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s58, s3, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s59, s4, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s60, s5, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s61, s6, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s62, s7, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s63, s8, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s8, s8, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s64, s9, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s65, s10, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s66, s11, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s67, s12, 0xff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s15 +; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s14, 0x80010 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s14, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s15 +; GFX6-NOHSA-NEXT: s_and_b32 s15, s13, 0xff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s14 +; GFX6-NOHSA-NEXT: s_bfe_u32 s14, s13, 0x80008 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s15 +; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s12, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s14 +; GFX6-NOHSA-NEXT: s_bfe_u32 s14, s13, 0x80010 +; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s13, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s14 +; GFX6-NOHSA-NEXT: s_and_b32 s14, s12, 0xff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s12, 0x80008 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s14 +; GFX6-NOHSA-NEXT: s_and_b32 s14, s0, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s13 +; GFX6-NOHSA-NEXT: s_and_b32 s13, s1, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s12, s12, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s68, s13, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s69, s14, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s14, s14, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s70, s15, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010 -; GFX6-NOHSA-NEXT: s_mov_b32 s0, s16 -; GFX6-NOHSA-NEXT: s_mov_b32 s1, s17 -; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s51 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s50 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s49 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s47 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s46 +; GFX6-NOHSA-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s45 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s66 +; GFX6-NOHSA-NEXT: s_and_b32 s12, s2, 0xff +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15 +; GFX6-NOHSA-NEXT: s_and_b32 s15, s11, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s15 +; GFX6-NOHSA-NEXT: s_and_b32 s15, s3, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s44 +; GFX6-NOHSA-NEXT: s_and_b32 s44, s4, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s11 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s65 +; GFX6-NOHSA-NEXT: s_and_b32 s11, s5, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s43 +; GFX6-NOHSA-NEXT: s_and_b32 s43, s10, 0xff +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s43 +; GFX6-NOHSA-NEXT: s_and_b32 s43, s6, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42 +; GFX6-NOHSA-NEXT: s_and_b32 s42, s7, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NOHSA-NEXT: s_and_b32 s10, s8, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s8, s8, 0x80010 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: s_and_b32 s41, s9, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:224 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:208 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:192 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[16:19], 0 offset:176 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s41 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s40 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s63 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s38 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s42 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s43 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s34 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s31 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s30 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s44 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s28 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s57 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s24 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s55 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s13 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s56 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s22 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s53 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s54 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32: @@ -3552,160 +3550,160 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s23, s2, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s25, s3, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s26, s3, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s29, s4, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s31, s5, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s33, s5, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s35, s6, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s37, s6, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s38, s7, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s39, s7, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s41, s8, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s43, s8, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s44, s9, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s46, s9, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s47, s10, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s48, s10, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s49, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s50, s11, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s51, s12, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s52, s12, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s53, s13, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s54, s13, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s55, s14, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s56, s14, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s57, s15, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s58, s15, 0x80008 ; GFX7-HSA-NEXT: s_and_b32 s24, s0, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s27, s1, 0xff +; GFX7-HSA-NEXT: s_and_b32 s26, s1, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s30, s2, 0xff +; GFX7-HSA-NEXT: s_and_b32 s27, s2, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s34, s3, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s28, s3, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s29, s3, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s3, s3, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s36, s4, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s30, s4, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s31, s4, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s33, s4, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s40, s5, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s34, s5, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s35, s5, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s36, s5, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s42, s6, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s37, s6, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s38, s6, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s39, s6, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s45, s7, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s40, s7, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s41, s7, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s42, s7, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s59, s8, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s60, s8, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s61, s9, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s62, s9, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s63, s10, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s43, s8, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s44, s8, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s45, s8, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s46, s8, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s47, s9, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s48, s9, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s49, s9, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s50, s9, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s51, s10, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s52, s10, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s53, s10, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s64, s11, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s54, s11, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s55, s11, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s56, s11, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s65, s12, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s57, s12, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s58, s12, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s59, s12, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s12, s12, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s66, s13, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s60, s13, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s61, s13, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s62, s13, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s13, s13, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s67, s14, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s63, s14, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s64, s14, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s65, s14, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s14, s14, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s66, s15, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s67, s15, 0x80008 ; GFX7-HSA-NEXT: s_and_b32 s68, s15, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s66 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s65 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7] ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s66 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s61 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[8:11] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[1:2], v[12:15] ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s9 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s51 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s47 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s42 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[16:19] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s40 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[24:27] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s55 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s37 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -3713,16 +3711,16 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 @@ -3734,13 +3732,13 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s22 ; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 @@ -4293,166 +4291,164 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GFX6-NOHSA-NEXT: s_mov_b32 s19, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s18, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_ashr_i32 s18, s0, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s19, s0, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s20, s0, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s21, s0 -; GFX6-NOHSA-NEXT: s_ashr_i32 s22, s1, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s23, s1, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s24, s1, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s25, s1 +; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s0, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s21, s0, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s22, s0, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s0, s0 +; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s1, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s24, s1, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s25, s1, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s1, s1 ; GFX6-NOHSA-NEXT: s_ashr_i32 s26, s2, 24 ; GFX6-NOHSA-NEXT: s_bfe_i32 s27, s2, 0x80010 ; GFX6-NOHSA-NEXT: s_bfe_i32 s28, s2, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s29, s2 -; GFX6-NOHSA-NEXT: s_ashr_i32 s30, s3, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s31, s3, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s33, s3, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s34, s3 -; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s4, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s36, s4, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s37, s4, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s3, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s30, s3, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s31, s3, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s3, s3 +; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s4, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s34, s4, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s35, s4, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s4, s4 -; GFX6-NOHSA-NEXT: s_ashr_i32 s38, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s39, s5, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s40, s5, 0x80008 +; GFX6-NOHSA-NEXT: s_ashr_i32 s36, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s37, s5, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s38, s5, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s5, s5 -; GFX6-NOHSA-NEXT: s_ashr_i32 s41, s6, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s42, s6, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s43, s6, 0x80008 +; GFX6-NOHSA-NEXT: s_ashr_i32 s39, s6, 24 +; GFX6-NOHSA-NEXT: s_bfe_i32 s40, s6, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_i32 s41, s6, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s6, s6 -; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s45, s7, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s46, s7, 0x80008 +; GFX6-NOHSA-NEXT: s_ashr_i32 s42, s7, 24 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s43, s15 +; GFX6-NOHSA-NEXT: s_bfe_i32 s44, s15, 0x80008 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s43 +; GFX6-NOHSA-NEXT: s_bfe_i32 s43, s7, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s44 +; GFX6-NOHSA-NEXT: s_bfe_i32 s44, s15, 0x80010 +; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s15, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s44, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NOHSA-NEXT: s_bfe_i32 s15, s7, 0x80008 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s44 +; GFX6-NOHSA-NEXT: s_bfe_i32 s44, s14, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s7, s7 -; GFX6-NOHSA-NEXT: s_ashr_i32 s47, s8, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s48, s8, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s49, s8, 0x80008 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s44 +; GFX6-NOHSA-NEXT: s_bfe_i32 s44, s14, 0x80010 +; GFX6-NOHSA-NEXT: s_ashr_i32 s14, s14, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s44 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s44, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s14 +; GFX6-NOHSA-NEXT: s_bfe_i32 s14, s13, 0x80008 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s44 +; GFX6-NOHSA-NEXT: s_ashr_i32 s44, s8, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s14 +; GFX6-NOHSA-NEXT: s_bfe_i32 s14, s13, 0x80010 +; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s13, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s14 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s14, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s13 +; GFX6-NOHSA-NEXT: s_bfe_i32 s13, s12, 0x80008 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s14 +; GFX6-NOHSA-NEXT: s_bfe_i32 s14, s8, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s13 +; GFX6-NOHSA-NEXT: s_bfe_i32 s13, s12, 0x80010 +; GFX6-NOHSA-NEXT: s_ashr_i32 s12, s12, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s13, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s12 +; GFX6-NOHSA-NEXT: s_bfe_i32 s12, s8, 0x80008 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s13 +; GFX6-NOHSA-NEXT: s_bfe_i32 s13, s11, 0x80008 ; GFX6-NOHSA-NEXT: s_sext_i32_i8 s8, s8 -; GFX6-NOHSA-NEXT: s_ashr_i32 s50, s9, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s51, s9, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s52, s9, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s9, s9 -; GFX6-NOHSA-NEXT: s_ashr_i32 s53, s10, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s54, s10, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s55, s10, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s10, s10 -; GFX6-NOHSA-NEXT: s_bfe_i32 s56, s11, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s57, s11, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s58, s11 -; GFX6-NOHSA-NEXT: s_ashr_i32 s59, s12, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s60, s12, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s61, s12, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s12, s12 -; GFX6-NOHSA-NEXT: s_ashr_i32 s62, s13, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s63, s13, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s64, s13, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s13, s13 -; GFX6-NOHSA-NEXT: s_ashr_i32 s65, s14, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s66, s14, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s67, s14, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s14, s14 -; GFX6-NOHSA-NEXT: s_ashr_i32 s68, s15, 24 -; GFX6-NOHSA-NEXT: s_bfe_i32 s69, s15, 0x80010 -; GFX6-NOHSA-NEXT: s_bfe_i32 s70, s15, 0x80008 -; GFX6-NOHSA-NEXT: s_sext_i32_i8 s15, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s13 +; GFX6-NOHSA-NEXT: s_bfe_i32 s13, s11, 0x80010 ; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s11, 24 -; GFX6-NOHSA-NEXT: s_mov_b32 s0, s16 -; GFX6-NOHSA-NEXT: s_mov_b32 s1, s17 -; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s66 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s63 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s57 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s56 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s13 +; GFX6-NOHSA-NEXT: s_bfe_i32 s13, s10, 0x80008 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s55 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s54 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s53 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s11, s10 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s9, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NOHSA-NEXT: s_bfe_i32 s13, s10, 0x80010 +; GFX6-NOHSA-NEXT: s_ashr_i32 s10, s10, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NOHSA-NEXT: s_bfe_i32 s13, s9, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NOHSA-NEXT: s_bfe_i32 s10, s9, 0x80008 +; GFX6-NOHSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:224 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:208 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:192 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[16:19], 0 offset:176 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s51 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s50 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s49 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s47 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s44 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s45 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s44 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s43 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s42 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s39 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s38 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s36 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s33 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s30 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s28 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s27 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s26 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s22 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32: @@ -4485,16 +4481,15 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 s37, s6, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s38, s6, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s39, s6, 0x80008 -; GFX7-HSA-NEXT: s_sext_i32_i8 s40, s6 -; GFX7-HSA-NEXT: s_ashr_i32 s6, s7, 24 +; GFX7-HSA-NEXT: s_ashr_i32 s40, s7, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s41, s7, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s42, s7, 0x80008 ; GFX7-HSA-NEXT: s_ashr_i32 s43, s8, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s44, s8, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s45, s8, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s47, s9, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s48, s9, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s49, s9, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s48, s9, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s49, s9, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s50, s9, 0x80008 ; GFX7-HSA-NEXT: s_ashr_i32 s51, s10, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s52, s10, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s53, s10, 0x80008 @@ -4513,158 +4508,159 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 s66, s15, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s67, s15, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s68, s15, 0x80008 -; GFX7-HSA-NEXT: s_sext_i32_i8 s46, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s50, s9 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s8 +; GFX7-HSA-NEXT: s_add_u32 s46, s16, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s47, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s47 +; GFX7-HSA-NEXT: s_add_u32 s46, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s47, s17, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 +; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s10 +; GFX7-HSA-NEXT: s_add_u32 s10, s16, 0xd0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s11 +; GFX7-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s68 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s66 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s10 +; GFX7-HSA-NEXT: s_add_u32 s10, s16, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s46 +; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s11, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 +; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s9 ; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s9 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[4:7] +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s62 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s61 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s60 -; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] ; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s59 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s58 ; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[29:30], v[8:11] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[1:2], v[12:15] ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[25:26], v[12:15] -; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s8 ; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s51 ; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 ; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 -; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s67 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s66 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s6 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s55 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s7 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[27:28], v[16:19] -; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s50 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[21:22], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s43 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s6 +; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[16:19] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s8 +; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s1 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[24:27] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 64 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_sext_i32_i8 s3, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_sext_i32_i8 s4, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_sext_i32_i8 s3, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_sext_i32_i8 s1, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: s_sext_i32_i8 s2, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s16, 32 -; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s16, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 -; GFX7-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_sext_i32_i8 s0, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 @@ -7033,81 +7029,79 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s26, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s8, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s24, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s30, s5 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s7, 31 ; GFX6-NOHSA-NEXT: s_ashr_i32 s38, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s34 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s35 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s33 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s22 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s9 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s5 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: @@ -7117,98 +7111,101 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s2, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s8, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s10, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s10, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s10, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s8, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s8, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s8, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s11, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s11, 8 +; GFX7-HSA-NEXT: s_mov_b32 s28, s11 +; GFX7-HSA-NEXT: s_lshr_b32 s6, s9, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s2, s9, 8 +; GFX7-HSA-NEXT: s_mov_b32 s4, s9 +; GFX7-HSA-NEXT: s_ashr_i32 s29, s9, 31 +; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[8:9], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8 -; GFX7-HSA-NEXT: s_ashr_i32 s27, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 24 -; GFX7-HSA-NEXT: s_mov_b32 s22, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s28, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s34, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s33, s9, 24 ; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[24:25], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[28:29], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s28, s11, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s36, s11, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX7-HSA-NEXT: s_add_u32 s24, s0, 0x50 ; GFX7-HSA-NEXT: s_addc_u32 s25, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s12, s0, 64 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: s_add_u32 s12, s0, 16 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX7-HSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -7621,134 +7618,135 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s16, s8, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s17, s9, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s10, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s19, s11, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s12, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s21, s13, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s22, s14, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s23, s15, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s24, s15, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s25, s14, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s26, s13, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s27, s12, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s28, s11, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s29, s10, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s7, s8, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s9, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s17, s10, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s11, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s19, s12, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s13, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s21, s13, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s22, s12, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s23, s11, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s6, s10, 0x80008 ; GFX7-HSA-NEXT: s_bfe_u32 s4, s9, 0x80008 ; GFX7-HSA-NEXT: s_bfe_u32 s2, s8, 0x80008 ; GFX7-HSA-NEXT: s_and_b32 s3, s8, 0xff ; GFX7-HSA-NEXT: s_and_b32 s5, s9, 0xff -; GFX7-HSA-NEXT: s_and_b32 s30, s10, 0xff -; GFX7-HSA-NEXT: s_and_b32 s31, s11, 0xff -; GFX7-HSA-NEXT: s_and_b32 s33, s12, 0xff -; GFX7-HSA-NEXT: s_and_b32 s34, s13, 0xff -; GFX7-HSA-NEXT: s_and_b32 s35, s14, 0xff -; GFX7-HSA-NEXT: s_and_b32 s36, s15, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 -; GFX7-HSA-NEXT: s_bfe_u32 s9, s9, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s24, s10, 0xff +; GFX7-HSA-NEXT: s_and_b32 s25, s11, 0xff +; GFX7-HSA-NEXT: s_and_b32 s26, s12, 0xff +; GFX7-HSA-NEXT: s_and_b32 s27, s13, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s28, s8, 0x80010 +; GFX7-HSA-NEXT: s_bfe_u32 s29, s9, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s10, s10, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s11, s11, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s12, s12, 0x80010 ; GFX7-HSA-NEXT: s_bfe_u32 s13, s13, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s30, s14, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s31, s14, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s33, s14, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s14, s14, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s34, s15, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s35, s15, 0x80008 +; GFX7-HSA-NEXT: s_and_b32 s36, s15, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x90 -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x50 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0x70 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 16 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0xe0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0xc0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0xa0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0x80 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0x60 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 +; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8205,158 +8203,160 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s2, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s0, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s0, 24 -; GFX6-NOHSA-NEXT: s_mov_b32 s34, s7 -; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s1, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s1, 24 -; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s3, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 24 -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s7, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s0, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s46, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s3, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s50, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 16 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s1, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s6, s1 -; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 -; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s56 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s57 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s39 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s54 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s55 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s53 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s19 -; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s8 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:144 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[46:47], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s2, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s0, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s0, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s62, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s56, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s3, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s64, s3 +; GFX6-NOHSA-NEXT: s_lshr_b32 s54, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s58, s1, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s12, s1 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[66:67], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[68:69], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s1, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s1, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[0:1], s[64:65], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s3, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s3, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s68 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s69 +; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s5, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s66 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s67 +; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s7, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s60 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[58:59], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s61 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[54:55], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s52 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[50:51], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s53 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s29 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s27 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s62 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s63 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s25 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s23 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s57 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s28 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s29 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v28, s24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v29, s25 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[26:29], off, s[8:11], 0 offset:208 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v28, s19 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s47 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s45 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v29, s21 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:192 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s49 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s17 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s31 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:128 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s23 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s39 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s27 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s29 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s35 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[8:11], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s39 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[8:11], 0 offset:240 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s43 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[8:11], 0 offset:224 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s45 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[8:11], 0 offset:176 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s47 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[8:11], 0 offset:160 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s49 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[26:29], off, s[8:11], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s3 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[8:11], 0 offset:32 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: @@ -8368,154 +8368,154 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s12, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24 -; GFX7-HSA-NEXT: s_ashr_i32 s33, s1, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s37, s1, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s34, s0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s30, s0, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s0, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s66, s1, 8 -; GFX7-HSA-NEXT: s_mov_b32 s68, s1 -; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x80000 -; GFX7-HSA-NEXT: s_lshr_b32 s36, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 16 -; GFX7-HSA-NEXT: s_ashr_i32 s41, s3, 31 +; GFX7-HSA-NEXT: s_lshr_b32 s48, s4, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s50, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s4, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s54, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s2, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s42, s2, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s24, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s16, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s3, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s3, 8 -; GFX7-HSA-NEXT: s_mov_b32 s62, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s52, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s54, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s40, s2, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s36, s0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s34, s0, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s0, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s3, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s3, 8 +; GFX7-HSA-NEXT: s_mov_b32 s10, s3 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s1, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s1, 8 +; GFX7-HSA-NEXT: s_mov_b32 s20, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s4, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s58, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s62, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s64, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s66, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s68, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s70, s7 +; GFX7-HSA-NEXT: s_ashr_i32 s67, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[0:1], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s33, s1, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s42, s1, 24 +; GFX7-HSA-NEXT: s_ashr_i32 s43, s3, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s44, s3, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[4:5], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i32 s45, s5, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s46, s5, 24 -; GFX7-HSA-NEXT: s_ashr_i32 s47, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s48, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[2:3], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[70:71], s[4:5], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[60:61], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s47, s7, 31 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[20:21], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[18:19], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[14:15], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[58:59], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[26:27], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[22:23], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[70:71], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[68:69], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[66:67], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[40:41], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[36:37], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s63, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 -; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0xc0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61 -; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s50 -; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 -; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 -; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s38 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s39 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[64:65], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 +; GFX7-HSA-NEXT: s_add_u32 s64, s8, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s65, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s30 +; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s31 +; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s65 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s31 +; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0x90 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s72 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: s_add_u32 s34, s8, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s73 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX7-HSA-NEXT: s_addc_u32 s35, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 -; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x80 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[4:7] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[1:2], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s25 +; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x50 +; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s25 +; GFX7-HSA-NEXT: s_add_u32 s24, s8, 64 +; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s25 +; GFX7-HSA-NEXT: s_add_u32 s24, s8, 16 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[3:4], v[12:15] +; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s25 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[16:19] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s22 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[9:10], v[20:23] ; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xf0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s23 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xe0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s29 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[11:12], v[24:27] +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s23 ; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s21 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s67 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s23 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[13:14], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s20 +; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s21 +; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xb0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 @@ -8542,7 +8542,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8561,7 +8561,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s42 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -11386,68 +11386,69 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_and_b32 s22, s1, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s12, s0, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s23, s1, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s22, s22, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s21, s0, 24 -; GFX7-HSA-NEXT: s_or_b32 s22, s23, s22 -; GFX7-HSA-NEXT: s_and_b32 s23, s0, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s13, s1, 24 -; GFX7-HSA-NEXT: s_or_b32 s23, s23, s12 -; GFX7-HSA-NEXT: s_mov_b32 s12, s1 -; GFX7-HSA-NEXT: s_mov_b32 s1, s21 -; GFX7-HSA-NEXT: s_and_b32 s20, s3, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b64 s[12:13], s[12:13], 16 +; GFX7-HSA-NEXT: s_and_b32 s25, s1, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s16, s0, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s26, s1, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s25, s25, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s0, 24 +; GFX7-HSA-NEXT: s_or_b32 s25, s26, s25 +; GFX7-HSA-NEXT: s_and_b32 s26, s0, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s16, s16, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s17, s1, 24 +; GFX7-HSA-NEXT: s_or_b32 s26, s26, s16 +; GFX7-HSA-NEXT: s_mov_b32 s16, s1 +; GFX7-HSA-NEXT: s_mov_b32 s1, s24 +; GFX7-HSA-NEXT: s_and_b32 s23, s3, 0xff00 +; GFX7-HSA-NEXT: s_lshr_b64 s[16:17], s[16:17], 16 ; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 -; GFX7-HSA-NEXT: s_and_b32 s19, s2, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s13, s0, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s14, s2, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s17, s0, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s3, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s20, 8 -; GFX7-HSA-NEXT: s_or_b32 s20, s0, s1 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s23, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s15, s3, 24 +; GFX7-HSA-NEXT: s_or_b32 s23, s0, s1 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s19, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s2, 24 -; GFX7-HSA-NEXT: s_or_b32 s19, s0, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s1, s3, 24 -; GFX7-HSA-NEXT: s_mov_b32 s0, s3 -; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 -; GFX7-HSA-NEXT: s_mov_b32 s3, s18 -; GFX7-HSA-NEXT: s_and_b32 s17, s5, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s21, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8 +; GFX7-HSA-NEXT: s_mov_b32 s14, s3 +; GFX7-HSA-NEXT: s_or_b32 s24, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[14:15], 16 +; GFX7-HSA-NEXT: s_mov_b32 s3, s22 +; GFX7-HSA-NEXT: s_and_b32 s21, s5, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s14, s0, 0xff00ff ; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16 -; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s12, s4, 0xff00 ; GFX7-HSA-NEXT: s_and_b32 s2, s0, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s5, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s17, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s11, s5, 24 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s21, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24 ; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1 ; GFX7-HSA-NEXT: s_and_b32 s0, s4, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s10, 8 -; GFX7-HSA-NEXT: s_mov_b32 s10, s5 -; GFX7-HSA-NEXT: s_or_b32 s17, s0, s1 -; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[10:11], 16 -; GFX7-HSA-NEXT: s_mov_b32 s5, s16 -; GFX7-HSA-NEXT: s_and_b32 s15, s7, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s10, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 s12, s5 +; GFX7-HSA-NEXT: s_or_b32 s15, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[12:13], 16 +; GFX7-HSA-NEXT: s_mov_b32 s5, s20 +; GFX7-HSA-NEXT: s_and_b32 s19, s7, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s12, s0, 0xff00ff ; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[4:5], 16 -; GFX7-HSA-NEXT: s_and_b32 s14, s6, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00 ; GFX7-HSA-NEXT: s_and_b32 s4, s0, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s7, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s15, 8 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s19, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s11, s7, 24 ; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1 ; GFX7-HSA-NEXT: s_and_b32 s0, s6, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8 -; GFX7-HSA-NEXT: s_or_b32 s11, s0, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s1, s7, 24 -; GFX7-HSA-NEXT: s_mov_b32 s0, s7 -; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 -; GFX7-HSA-NEXT: s_lshr_b32 s7, s6, 24 -; GFX7-HSA-NEXT: s_and_b32 s14, s0, 0xff00ff +; GFX7-HSA-NEXT: s_lshl_b32 s1, s10, 8 +; GFX7-HSA-NEXT: s_mov_b32 s10, s7 +; GFX7-HSA-NEXT: s_or_b32 s13, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[10:11], 16 +; GFX7-HSA-NEXT: s_mov_b32 s7, s18 +; GFX7-HSA-NEXT: s_and_b32 s10, s0, 0xff00ff ; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[6:7], 16 -; GFX7-HSA-NEXT: s_and_b32 s12, s12, 0xff00ff +; GFX7-HSA-NEXT: s_and_b32 s16, s16, 0xff00ff ; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 @@ -11455,32 +11456,32 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index d23c49165ec70..7dd8c54363ae4 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -2216,40 +2216,40 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i32: @@ -2658,6 +2658,8 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 @@ -2672,41 +2674,40 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 16, v13 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xffff, v12 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 16, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v3 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v2 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v1 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v0 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v12 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xffff, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v40, 0xffff, v14 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 @@ -2724,93 +2725,92 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[4:5] +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[16:19] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32: @@ -2834,15 +2834,16 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v15 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -2863,7 +2864,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v9 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 @@ -3072,6 +3072,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 @@ -3101,24 +3103,23 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v10 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v11, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v8, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v15, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v13 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v12 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 16, v9 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v14 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v15, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 @@ -3130,13 +3131,13 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 @@ -3162,9 +3163,13 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) @@ -3175,16 +3180,13 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v14 ; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v15, v14, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[15:18] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 @@ -3195,8 +3197,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v8 ; GCN-HSA-NEXT: v_bfe_i32 v17, v9, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[15:18] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14] @@ -3209,11 +3210,11 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[7:10] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v0 @@ -3223,8 +3224,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[3:6] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32: @@ -3248,11 +3249,16 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v2 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v15 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v14 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v14, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 16, v1 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 16, v0 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v1, 0, 16 @@ -3269,17 +3275,12 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v10 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v11, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v9 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v14 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 16, v9 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 @@ -3513,139 +3514,113 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s11 -; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xffff, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v13 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s10 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s11 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v17 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v12 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v25, 0xffff, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(13) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(12) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v0 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(11) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v30 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v29 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v57, 0xffff, v28 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v55, 0xffff, v27 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(10) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v62, 16, v32 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v31 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v29, 0xffff, v34 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v61, 0xffff, v32 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v59, 0xffff, v31 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(9) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v37 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v35 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, 0xffff, v38 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v37 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v36 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v19 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v19 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v42 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, 0xffff, v41 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v21 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v20 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v23 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v33 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v32 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v32 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v37, 16, v35 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xffff, v35 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v29 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v29 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v28 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v28 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v41, 16, v31 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 16, v30 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v40, 0xffff, v31 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v30 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v12 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v29, 0xffff, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v12 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v44, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v14 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v2 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v47, 16, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v48, 0xffff, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v25, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v21, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: @@ -3657,321 +3632,297 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s8, s2, 16 +; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 +; GCN-HSA-NEXT: s_add_u32 s6, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s2, 48 -; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GCN-HSA-NEXT: s_add_u32 s10, s2, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[16:17] +; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 ; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x50 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 ; GCN-HSA-NEXT: s_add_u32 s10, s2, 0x60 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5] ; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[12:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6 -; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[12:13] -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[14:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v25 -; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v24 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[32:35] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xa0 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xb0 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x80 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v27 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v27 -; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v26 +; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[32:35] +; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x80 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v14 +; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[32:35] ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s14 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s10 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[24:27] -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v28 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v29 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v29 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v25 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v24 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s12 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v27 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v27 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v26 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s9 +; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v21 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v20 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v23 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[22:25] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v28 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v28 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v31 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v30 ; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v31 ; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v30 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v10 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[23:26] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v22 +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xffff, v17 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v29, 0xffff, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(12) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: s_waitcnt vmcnt(12) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[23:26] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v4 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[23:26] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v19 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v27, 0xffff, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[27:30] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s11 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v21 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v24 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v22 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v24 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v27 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v27 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v26 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v12 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v19, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v20, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v21, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xffff, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(12) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v40 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v53, 0xffff, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v57, 0xffff, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v55, 0xffff, v27 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v62, 16, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v60, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v61, 0xffff, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v59, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v37 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v35 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v38 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v31, 0xffff, v37 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v36 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v42 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v41 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v29 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v28 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v28 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v30 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v17 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v19 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v18 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v21, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v42, 16, v15 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v41, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v9 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v31, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v64i16_to_v64i32: @@ -4335,124 +4286,113 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s11 -; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s7 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[40:43], off, s[4:7], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v35 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v34 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v35, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v34, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v33 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v32 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v33, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v32, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s10 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s11 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[18:21], off, s[4:7], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[22:25], off, s[4:7], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[26:29], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v39 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v38 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v39, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v38, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v37 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v36 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v37, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v36, 0, 16 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v43 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v42 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v43, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v42, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v41 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v40 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v41, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v40, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v31 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v30 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v31, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v30, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v29 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v28 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v29, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v28, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v30, 16, v27 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v28, 16, v26 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v27, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v26, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v25 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v24 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v25, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v24, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v23 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v22 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v23, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v21 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v20 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v20, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v19, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v19 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v18 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v19, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v21 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v20 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v23 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v22 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v25 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v25, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 16, v24 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v24, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v13 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v12 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v15 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v15, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v14 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v29 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v3 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v3, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 16, v28 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v29, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v28, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v27 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v26 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v26, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 16, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 16, v19 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v18 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v19, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v7, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v42, 16, v9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v40, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v41, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v39, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v11, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: @@ -4469,34 +4409,34 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 64 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 +; GCN-HSA-NEXT: s_add_u32 s6, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s8, s2, 32 +; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[8:9] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[16:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[16:17] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] @@ -4514,271 +4454,251 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s2 +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v34, 16, v31 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v32, 16, v30 -; GCN-HSA-NEXT: v_bfe_i32 v33, v31, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v31, v30, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[31:34] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v31 +; GCN-HSA-NEXT: v_bfe_i32 v32, v31, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v30 +; GCN-HSA-NEXT: v_bfe_i32 v30, v30, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[30:33] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v20 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v21 -; GCN-HSA-NEXT: v_bfe_i32 v30, v21, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v20, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[35:36], v[28:31] -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v23 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v22 -; GCN-HSA-NEXT: v_bfe_i32 v30, v23, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v22, 0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v13 +; GCN-HSA-NEXT: v_bfe_i32 v30, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v28, v12, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[28:31] +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v39, s3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v22, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v12, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v14 ; GCN-HSA-NEXT: v_bfe_i32 v30, v15, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v14 ; GCN-HSA-NEXT: v_bfe_i32 v28, v14, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v14, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v22, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v6, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v9 +; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v30, v11, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v28, v10, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[28:31] +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v10, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v14, v7, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15] +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v27 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v26 +; GCN-HSA-NEXT: v_bfe_i32 v9, v27, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v26, 0, 16 +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v21 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v20 +; GCN-HSA-NEXT: v_bfe_i32 v28, v21, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v26, v20, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v25 +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[26:29] +; GCN-HSA-NEXT: v_bfe_i32 v13, v25, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v28, 16, v23 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v22 +; GCN-HSA-NEXT: v_bfe_i32 v27, v23, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v25, v22, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v24 +; GCN-HSA-NEXT: v_bfe_i32 v11, v24, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 16, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[25:28] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v23, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v21, v0, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] -; GCN-HSA-NEXT: s_waitcnt vmcnt(11) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v8, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v32, 16, v17 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v30, 16, v16 +; GCN-HSA-NEXT: v_bfe_i32 v31, v17, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v29, v16, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[29:32] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v28, 16, v19 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v18 +; GCN-HSA-NEXT: v_bfe_i32 v27, v19, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v25, v18, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v24 -; GCN-HSA-NEXT: v_bfe_i32 v12, v24, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v23, v11, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v21, v10, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[25:28] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[21:24] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: s_waitcnt vmcnt(12) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18 -; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v18, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v17 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v17, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v17, v16, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[21:24] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v25 -; GCN-HSA-NEXT: v_bfe_i32 v14, v25, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v27 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v26 -; GCN-HSA-NEXT: v_bfe_i32 v6, v27, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v26, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s11 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[23:26], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 16, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v25 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v24 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v25, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v24, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v28 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v27 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v27, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v26 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v26, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v29 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v29, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v28, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v31 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v31, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v30 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v30, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v37, 16, v15 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v13 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v14 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v1 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v32, 16, v3 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v31, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v38, 16, v17 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v36, 16, v16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v37, v17, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v35, v16, 0, 16 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v7 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 16, v23 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v21 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v22 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v23, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v22, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v20 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v21, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v20, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v19, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v18, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v42, 16, v9 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v40, 16, v8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v41, v9, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v39, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(12) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v50, 16, v1 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v48, 16, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v49, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v47, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v36 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v35 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v36, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v35, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v46, 16, v5 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v44, 16, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v45, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v43, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v54, 16, v26 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v52, 16, v25 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v53, v26, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v51, v25, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v58, 16, v24 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v56, 16, v23 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v57, v24, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v55, v23, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v30 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 16, v29 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v30, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v29, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v62, 16, v28 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v60, 16, v27 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v61, v28, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v59, v27, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 16, v34 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 16, v33 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v34, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v33, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v32 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v31 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v32, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v31, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v34, 16, v38 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v32, 16, v37 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v38, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v31, v37, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v64i16_to_v64i32: @@ -6409,28 +6329,28 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 +; GCN-HSA-NEXT: v_bfe_i32 v4, v2, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1 +; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v7, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -6592,140 +6512,140 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v36, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v38, v5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v1 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, v9 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, v9 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v39, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v5 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[14:17] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[17:20] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v5 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[14:17] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[10:13] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[20:23] -; GCN-HSA-NEXT: flat_store_dwordx4 v[3:4], v[14:17] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[6:9] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v30, 16, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[24:27] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[28:31] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: @@ -6740,50 +6660,50 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v9 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v34, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v38, v9 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v29 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v31, 0xffff, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xffff, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v16i16_to_v16i64: @@ -6978,52 +6898,51 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v7 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 16, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 16, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v10, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v1 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v1 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 31, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v30, 16, v3 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v9, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v15, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v14, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v11, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -7044,82 +6963,82 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v5 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v9, v18, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v8, v6, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v6, v11, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v11, v17, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v7 +; GCN-HSA-NEXT: v_bfe_i32 v10, v10, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_bfe_i32 v6, v19, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10] -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 ; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v18, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_bfe_i32 v2, v17, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v15, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64: @@ -7137,54 +7056,54 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v20, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v22, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v23, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v19, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 31, v30 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 @@ -7372,136 +7291,110 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s11 -; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[18:21], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[26:29], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v17 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v18 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v18 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v20 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v19 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v21 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v22 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v22 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v24 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v23 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v25 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v25 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v26 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v28 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v28 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v39 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, v8 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v55, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v57, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v51, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v43, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v45, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v59, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v47, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v39 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v8 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v39 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v8 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v34 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v34 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v38, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v40, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v42, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v44, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v46, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v48, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v50, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v52, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v54, v8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v56, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v28 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 16, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 16, v33 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v47, 16, v31 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v51, 16, v29 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v55, 16, v27 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v27 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v29 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v31 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v41, 0xffff, v33 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[53:56], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: @@ -7515,20 +7408,20 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 -; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 @@ -7543,123 +7436,133 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 -; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v40, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v42, v1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] +; GCN-HSA-NEXT: v_mov_b32_e32 v33, v1 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xffff, v11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[30:33] +; GCN-HSA-NEXT: v_mov_b32_e32 v36, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s12 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xffff, v9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[35:36], v[30:33] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v9 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[18:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s9 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v10 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[17:20] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[9:12] -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v13 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[37:38], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v45, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, s9 +; GCN-HSA-NEXT: s_waitcnt vmcnt(4) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v19 +; GCN-HSA-NEXT: v_mov_b32_e32 v44, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v15 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v36, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[23:26] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v33, 0xffff, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[31:32], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[44:45], v[33:36] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[37:38], v[26:29] +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v30, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v28, 0xffff, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[28:31] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v37, 0xffff, v18 +; GCN-HSA-NEXT: v_mov_b32_e32 v38, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[37:40] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v41, 0xffff, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v44, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[21:24] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[41:44] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[18:21] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[17:20] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, v1 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[15:18] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[21:24] ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[5:8] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: @@ -7672,97 +7575,94 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v21, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v15 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v40, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v15 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xffff, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v34 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v53, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v50, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v47, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v41, 0xffff, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v44, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v41, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v38, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v38, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v41, v1 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v11 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xffff, v8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v9 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v1 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v16 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xffff, v16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v1 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v1 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i16_to_v32i64: @@ -8109,114 +8009,113 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v15 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v23, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v13 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v11 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v27, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v1 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v15 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v24, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v7 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v13 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v13 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v5 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v10, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v31, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v33, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v40, 31, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 31, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v36, 16, v11 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v29, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 31, v34 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v5 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v11 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v37, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v9 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v9 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v44, 31, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v41, v15, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v20, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v48, 31, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v3 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v45, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v46, 31, v45 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v19, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v42, 31, v41 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v32, 31, v31 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v34, 31, v33 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v35, v19, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v36, 31, v35 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v38, 31, v37 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v26, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v33, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v34, 31, v33 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 31, v30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: @@ -8237,10 +8136,10 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] @@ -8266,141 +8165,142 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[13:16] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_bfe_i32 v12, v12, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[13:16] +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s3 +; GCN-HSA-NEXT: v_bfe_i32 v14, v17, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v19, v6, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_bfe_i32 v12, v9, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_bfe_i32 v16, v29, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v18, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v10, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_bfe_i32 v23, v23, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v21, v22, 0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-HSA-NEXT: v_bfe_i32 v18, v18, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v8 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[19:22] +; GCN-HSA-NEXT: v_bfe_i32 v28, v5, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[23:26] +; GCN-HSA-NEXT: flat_store_dwordx4 v[31:32], v[16:19] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v5 +; GCN-HSA-NEXT: v_bfe_i32 v18, v4, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v30, 16, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v28 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[28:31] +; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v34, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v20, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GCN-HSA-NEXT: v_bfe_i32 v16, v8, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v28, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v22, v0, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v1 +; GCN-HSA-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v33, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_bfe_i32 v20, v10, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v9 +; GCN-HSA-NEXT: v_bfe_i32 v24, v9, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v30, 16, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_bfe_i32 v8, v21, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v26, v26, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_bfe_i32 v8, v34, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-HSA-NEXT: v_bfe_i32 v14, v27, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, v7 -; GCN-HSA-NEXT: v_bfe_i32 v18, v18, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v19, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[28:31] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: @@ -8413,115 +8313,115 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[9:12], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v16, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v12 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v19, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v10 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v17, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v18, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v16, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, v30 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v24, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 31, v30 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 31, v32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v25 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v25 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v28, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v27, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v29, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v29 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 31, v30 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v23 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v22, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v31, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v32, 31, v31 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v23, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v17, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 31, v30 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v24, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v19, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v35, v35, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v37, v36, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v36, 31, v35 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v38, 31, v37 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:176 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 31, v32 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 31, v34 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 7203545ebf9a8..d0de09f23bcd7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -2373,71 +2373,72 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: s_mov_b32 s10, s2 ; SI-NOHSA-NEXT: s_mov_b32 s11, s3 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; SI-NOHSA-NEXT: s_mov_b32 s0, s4 +; SI-NOHSA-NEXT: s_mov_b32 s1, s5 ; SI-NOHSA-NEXT: s_mov_b32 s8, s6 ; SI-NOHSA-NEXT: s_mov_b32 s9, s7 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v0 -; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v0 -; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v1 -; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v2 -; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v3 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v18, 31, v3 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v16, 31, v2 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v1 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v0 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v6 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 -; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v4 -; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v5 -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v30, 31, v5 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v28, 31, v4 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(1) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 -; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8 -; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9 -; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v10 -; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v11 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v34, 31, v11 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v32, 31, v10 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12 -; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v12 -; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v13 -; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v14 -; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v15 -; SI-NOHSA-NEXT: s_mov_b32 s0, s4 -; SI-NOHSA-NEXT: s_mov_b32 s1, s5 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v38, 31, v13 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v36, 31, v12 +; SI-NOHSA-NEXT: v_mov_b32_e32 v35, v12 +; SI-NOHSA-NEXT: v_mov_b32_e32 v37, v13 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:96 +; SI-NOHSA-NEXT: s_waitcnt expcnt(0) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v38, 31, v9 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v36, 31, v8 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v42, 31, v15 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v40, 31, v14 +; SI-NOHSA-NEXT: v_mov_b32_e32 v39, v14 +; SI-NOHSA-NEXT: v_mov_b32_e32 v41, v15 +; SI-NOHSA-NEXT: v_mov_b32_e32 v35, v8 +; SI-NOHSA-NEXT: v_mov_b32_e32 v37, v9 +; SI-NOHSA-NEXT: v_mov_b32_e32 v31, v10 +; SI-NOHSA-NEXT: v_mov_b32_e32 v33, v11 +; SI-NOHSA-NEXT: v_mov_b32_e32 v27, v4 +; SI-NOHSA-NEXT: v_mov_b32_e32 v29, v5 +; SI-NOHSA-NEXT: v_mov_b32_e32 v23, v6 +; SI-NOHSA-NEXT: v_mov_b32_e32 v25, v7 +; SI-NOHSA-NEXT: v_mov_b32_e32 v19, v0 +; SI-NOHSA-NEXT: v_mov_b32_e32 v21, v1 +; SI-NOHSA-NEXT: v_mov_b32_e32 v15, v2 +; SI-NOHSA-NEXT: v_mov_b32_e32 v17, v3 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:112 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:64 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:80 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:32 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:16 ; SI-NOHSA-NEXT: s_endpgm ; ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 @@ -2463,9 +2464,13 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) @@ -2476,16 +2481,13 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 @@ -2496,8 +2498,7 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v14 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v15 -; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] @@ -2510,11 +2511,11 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0 @@ -2524,8 +2525,8 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v3 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[4:7] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[8:11] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7] ; GCNX3-HSA-NEXT: s_endpgm ; ; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: @@ -2538,56 +2539,54 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v24, v4 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v26, v5 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v20, v6 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v22, v7 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v2 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v3 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v8 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v9 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v10 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v11 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v18, 31, v3 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v16, 31, v2 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v1 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v12 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v13 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v14 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v15 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v38, 31, v13 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v36, 31, v12 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v35, v12 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v37, v13 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v42, 31, v15 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v40, 31, v14 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v39, v14 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v41, v15 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v0 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v30, 31, v5 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v28, 31, v4 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v34, 31, v11 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v32, 31, v10 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:96 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v31, v10 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v38, 31, v9 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v36, 31, v8 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v35, v8 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v37, v9 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v33, v11 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v27, v4 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v29, v5 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v23, v6 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v25, v7 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v0 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v21, v1 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v15, v2 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v17, v3 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:112 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:64 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:80 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:32 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:16 ; GCNX3-NOHSA-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i32_to_v16i64: @@ -2681,56 +2680,54 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v43, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32 -; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v36, s[2:3] offset:48 -; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3] offset:16 -; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v36, s[2:3] +; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v43, s[2:3] +; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v43, s[2:3] offset:16 +; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v43, s[2:3] offset:32 +; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v43, s[2:3] offset:48 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(2) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, v9 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v15 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:96 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:112 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:64 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:80 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:32 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] offset:48 -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] -; GCN-HSA-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, v13 +; GCN-HSA-NEXT: global_store_dwordx4 v43, v[35:38], s[0:1] offset:96 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v39, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v41, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v10 +; GCN-HSA-NEXT: global_store_dwordx4 v43, v[39:42], s[0:1] offset:112 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, v11 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v3 +; GCN-HSA-NEXT: global_store_dwordx4 v43, v[35:38], s[0:1] offset:64 +; GCN-HSA-NEXT: global_store_dwordx4 v43, v[31:34], s[0:1] offset:80 +; GCN-HSA-NEXT: global_store_dwordx4 v43, v[27:30], s[0:1] offset:32 +; GCN-HSA-NEXT: global_store_dwordx4 v43, v[23:26], s[0:1] offset:48 +; GCN-HSA-NEXT: global_store_dwordx4 v43, v[19:22], s[0:1] +; GCN-HSA-NEXT: global_store_dwordx4 v43, v[15:18], s[0:1] offset:16 ; GCN-HSA-NEXT: s_endpgm %ld = load <16 x i32>, ptr addrspace(1) %in %ext = sext <16 x i32> %ld to <16 x i64> @@ -2828,25 +2825,25 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v1 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v3 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v5 @@ -2861,12 +2858,12 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[16:19] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11 ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12 @@ -3061,128 +3058,115 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; SI-NOHSA-LABEL: global_sextload_v32i32_to_v32i64: ; SI-NOHSA: ; %bb.0: -; SI-NOHSA-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; SI-NOHSA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; SI-NOHSA-NEXT: s_mov_b32 s14, -1 -; SI-NOHSA-NEXT: s_mov_b32 s15, 0xe8f000 -; SI-NOHSA-NEXT: s_add_u32 s12, s12, s11 -; SI-NOHSA-NEXT: s_addc_u32 s13, s13, 0 -; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; SI-NOHSA-NEXT: s_mov_b32 s2, -1 -; SI-NOHSA-NEXT: s_mov_b32 s10, s2 -; SI-NOHSA-NEXT: s_mov_b32 s11, s3 +; SI-NOHSA-NEXT: s_mov_b32 s6, s2 +; SI-NOHSA-NEXT: s_mov_b32 s7, s3 ; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOHSA-NEXT: s_mov_b32 s8, s6 -; SI-NOHSA-NEXT: s_mov_b32 s9, s7 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:96 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v31 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v30 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(6) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v13 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v12 -; SI-NOHSA-NEXT: v_mov_b32_e32 v40, v12 -; SI-NOHSA-NEXT: v_mov_b32_e32 v42, v13 -; SI-NOHSA-NEXT: v_mov_b32_e32 v36, v14 -; SI-NOHSA-NEXT: v_mov_b32_e32 v38, v15 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28 -; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v28 -; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v29 -; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v30 -; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v31 -; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill -; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; SI-NOHSA-NEXT: s_waitcnt vmcnt(9) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v6 +; SI-NOHSA-NEXT: s_mov_b32 s0, s8 +; SI-NOHSA-NEXT: s_mov_b32 s1, s9 +; SI-NOHSA-NEXT: s_mov_b32 s4, s10 +; SI-NOHSA-NEXT: s_mov_b32 s5, s11 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[10:13], off, s[4:7], 0 offset:112 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[14:17], off, s[4:7], 0 offset:96 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[18:21], off, s[4:7], 0 offset:80 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[22:25], off, s[4:7], 0 offset:64 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v11 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v10 +; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v10 +; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v11 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[30:33], off, s[4:7], 0 offset:48 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:224 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v5 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v4 -; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v4 -; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v5 -; SI-NOHSA-NEXT: v_mov_b32_e32 v12, v6 -; SI-NOHSA-NEXT: v_mov_b32_e32 v14, v7 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v0 -; SI-NOHSA-NEXT: v_mov_b32_e32 v48, v0 -; SI-NOHSA-NEXT: v_mov_b32_e32 v50, v1 -; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2 -; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v12 +; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v12 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v13 +; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v13 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:240 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v14 +; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v14 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v15 +; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v15 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v16 +; SI-NOHSA-NEXT: v_mov_b32_e32 v12, v16 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v17 +; SI-NOHSA-NEXT: v_mov_b32_e32 v14, v17 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:192 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v18 +; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v18 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v19 +; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v19 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v20 +; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v20 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v21 +; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v21 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 +; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v22 +; SI-NOHSA-NEXT: v_mov_b32_e32 v12, v22 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v23 +; SI-NOHSA-NEXT: v_mov_b32_e32 v14, v23 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16 -; SI-NOHSA-NEXT: v_mov_b32_e32 v52, v16 -; SI-NOHSA-NEXT: v_mov_b32_e32 v54, v17 -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v18 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v19 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(6) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20 -; SI-NOHSA-NEXT: v_mov_b32_e32 v56, v20 -; SI-NOHSA-NEXT: v_mov_b32_e32 v58, v21 -; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v22 -; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v23 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v3 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v25 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:160 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v24 +; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v24 +; SI-NOHSA-NEXT: v_mov_b32_e32 v36, v25 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v2 +; SI-NOHSA-NEXT: s_waitcnt expcnt(0) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v26, 31, v1 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v27 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v26 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v63, 31, v25 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v61, 31, v24 -; SI-NOHSA-NEXT: v_mov_b32_e32 v60, v24 -; SI-NOHSA-NEXT: v_mov_b32_e32 v62, v25 -; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v26 -; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v27 -; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v11 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v10 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 -; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 -; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8 -; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9 -; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v10 -; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v11 -; SI-NOHSA-NEXT: s_mov_b32 s0, s4 -; SI-NOHSA-NEXT: s_mov_b32 s1, s5 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192 -; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload -; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:32 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 -; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:16 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v31 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v30 +; SI-NOHSA-NEXT: v_mov_b32_e32 v38, v30 +; SI-NOHSA-NEXT: v_mov_b32_e32 v40, v31 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v24, 31, v0 +; SI-NOHSA-NEXT: s_waitcnt expcnt(0) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v16, 31, v7 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v30, 31, v33 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v28, 31, v32 +; SI-NOHSA-NEXT: v_mov_b32_e32 v27, v32 +; SI-NOHSA-NEXT: v_mov_b32_e32 v29, v33 +; SI-NOHSA-NEXT: s_waitcnt expcnt(0) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v14, 31, v6 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v5 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v9 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:144 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v8 +; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v8 +; SI-NOHSA-NEXT: v_mov_b32_e32 v48, v9 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v4 +; SI-NOHSA-NEXT: s_waitcnt expcnt(0) +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v34, 31, v11 +; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v32, 31, v10 +; SI-NOHSA-NEXT: v_mov_b32_e32 v31, v10 +; SI-NOHSA-NEXT: v_mov_b32_e32 v33, v11 +; SI-NOHSA-NEXT: v_mov_b32_e32 v42, v4 +; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v5 +; SI-NOHSA-NEXT: v_mov_b32_e32 v13, v6 +; SI-NOHSA-NEXT: v_mov_b32_e32 v15, v7 +; SI-NOHSA-NEXT: v_mov_b32_e32 v23, v0 +; SI-NOHSA-NEXT: v_mov_b32_e32 v25, v1 +; SI-NOHSA-NEXT: v_mov_b32_e32 v19, v2 +; SI-NOHSA-NEXT: v_mov_b32_e32 v21, v3 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:96 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:64 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:80 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:32 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 +; SI-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16 ; SI-NOHSA-NEXT: s_endpgm ; ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64: @@ -3204,181 +3188,182 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[8:9] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s4 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s1 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v28 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v29 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[32:35] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v31 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v30 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v30 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v31 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v31 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[32:35] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v25 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v24 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v24 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v24 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v25 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v25 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v27 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v26 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v26 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v27 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v27 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[28:31] +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v12 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v38, s2 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v12 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v13 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v13 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v14 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v15 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v15 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v21 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v20 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v20 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v21 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v8 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v8 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v9 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v23 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v22 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v22 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v23 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[24:27] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v15 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v14 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v13 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v12 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v12 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v13 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v14 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v15 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[24:27] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v20 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v11 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v10 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v10 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v11 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[12:15] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v21 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v20 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v21 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[13:14], v[8:11] +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v23 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v7 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v22 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v22 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, v23 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[25:28] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v17 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v17 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, v17 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[25:28] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v19 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v18 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v18 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v19 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[23:26] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v9 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, v8 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v9 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v5 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v4 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v5 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[15:18] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[21:24] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v1 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v0 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v11 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, v1 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v1 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v19 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v18 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v18 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, v19 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v7 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[25:28] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, v6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v7 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] ; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v3 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v2 -; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v10 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v3 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v2 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[9:12] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCNX3-HSA-NEXT: s_endpgm ; ; GCNX3-NOHSA-LABEL: global_sextload_v32i32_to_v32i64: @@ -3391,104 +3376,102 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6 ; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 -; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:32 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48 +; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4 ; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v8 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v9 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:224 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v12 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v10 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v10 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v11 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v11 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v12 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v13 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v13 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:240 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v14 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v14 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v15 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v15 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v11 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v10 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v16 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v16 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v17 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v17 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v18 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v18 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v19 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v19 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v13 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v12 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v40, v12 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v42, v13 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v14 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v15 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v8 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v8 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v9 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v10 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v11 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(5) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v5 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v4 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v44, v4 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v46, v5 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v20 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v20 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v21 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v21 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v23 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v22 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v22 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v23 ; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(4) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v19 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v48, v0 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v50, v1 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v52, v16 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v54, v17 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v18 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v56, v20 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v58, v21 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v22 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v23 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v27 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v26 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v40, 31, v24 -; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v38, 31, v31 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v36, 31, v30 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v37, v31 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v29 -; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v28 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v28 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v29 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v35, v30 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v39, v24 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v41, v25 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v26 -; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v21, v27 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:48 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 -; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v25 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v24 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v20, v24 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v22, v25 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v27 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v42, v27 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v33 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:208 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v30, 31, v3 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v28, 31, v2 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0 +; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:128 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v26 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v40, v26 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v12, 31, v6 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v18, 31, v5 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v32 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v24, v32 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v26, v33 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v16, 31, v4 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v35 +; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v34 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v34 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v35 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v15, v4 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v17, v5 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:112 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:80 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v6 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v13, v7 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v0 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, v1 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v27, v2 +; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v29, v3 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:48 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 +; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:16 ; GCNX3-NOHSA-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v32i32_to_v32i64: @@ -3656,233 +3639,108 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; EG-NEXT: MOV * T32.Z, T12.Y, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64: -; GCN-GFX900-HSA: ; %bb.0: -; GCN-GFX900-HSA-NEXT: s_mov_b64 s[22:23], s[2:3] -; GCN-GFX900-HSA-NEXT: s_mov_b64 s[20:21], s[0:1] -; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v12, 0 -; GCN-GFX900-HSA-NEXT: s_add_u32 s20, s20, s17 -; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0 -; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:96 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:112 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[25:28], v12, s[2:3] offset:80 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[13:16], v12, s[2:3] offset:64 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[17:20], v12, s[2:3] offset:48 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] offset:32 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] offset:16 -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v6 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v7 -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v11 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v10 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v9 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v8 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v37, v8 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v39, v9 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v33, v10 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v35, v11 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v4 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v5 -; GCN-GFX900-HSA-NEXT: buffer_store_dword v29, off, s[20:23], 0 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: buffer_store_dword v30, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v31, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v32, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v16 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v15 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v14 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v13 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v45, v13 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v47, v14 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v15 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v16 -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v52, 31, v18 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v50, 31, v17 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v49, v17 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v51, v18 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v13, v19 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v15, v20 -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v53, v21 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v55, v22 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v17, v23 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v19, v24 -; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] -; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v36, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v28 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v27 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v26 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v41, v25 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28 -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11) -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v0 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v25, v0 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192 -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v0, v23 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v2, v24 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v22 -; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21 -; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22 -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[45:48], s[0:1] offset:128 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:144 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[49:52], s[0:1] offset:96 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[13:16], s[0:1] offset:112 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[53:56], s[0:1] offset:64 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[17:20], s[0:1] offset:80 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[25:28], s[0:1] offset:32 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[57:60], s[0:1] offset:48 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GCN-GFX900-HSA-NEXT: s_endpgm -; -; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64: -; GCN-GFX908-HSA: ; %bb.0: -; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v12, 0 -; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:96 -; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:112 -; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[25:28], v12, s[2:3] offset:80 -; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[13:16], v12, s[2:3] offset:64 -; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[17:20], v12, s[2:3] offset:48 -; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] offset:32 -; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] offset:16 -; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6 -; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v11 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v10 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v9 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v8 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v37, v8 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v39, v9 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v33, v10 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v35, v11 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, v4 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v10, v5 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v6 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v7 -; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v16 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v15 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v14 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v13 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v45, v13 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v47, v14 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v4, v15 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v6, v16 -; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(2) -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v52, 31, v18 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v50, 31, v17 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v49, v17 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v51, v18 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v13, v19 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v15, v20 -; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v53, v21 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v55, v22 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v17, v23 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v19, v24 -; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] -; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a0, v29 -; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a3, v32 -; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a1, v30 -; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a2, v31 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v28 -; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v36, a3 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v27 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v26 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v41, v25 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v43, v26 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v27 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v28 -; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v0 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v0 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v1 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v57, v2 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v59, v3 -; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v35, a2 -; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v34, a1 -; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v33, a0 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192 -; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v0, v23 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v2, v24 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v22 -; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, v21 -; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v10, v22 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[45:48], s[0:1] offset:128 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:144 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[49:52], s[0:1] offset:96 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[13:16], s[0:1] offset:112 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[53:56], s[0:1] offset:64 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[17:20], s[0:1] offset:80 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[25:28], s[0:1] offset:32 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[57:60], s[0:1] offset:48 -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] -; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 -; GCN-GFX908-HSA-NEXT: s_endpgm +; GCN-HSA-LABEL: global_sextload_v32i32_to_v32i64: +; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 +; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: global_load_dwordx4 v[13:16], v12, s[2:3] offset:112 +; GCN-HSA-NEXT: global_load_dwordx4 v[17:20], v12, s[2:3] offset:96 +; GCN-HSA-NEXT: global_load_dwordx4 v[21:24], v12, s[2:3] offset:80 +; GCN-HSA-NEXT: global_load_dwordx4 v[25:28], v12, s[2:3] offset:64 +; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] +; GCN-HSA-NEXT: global_load_dwordx4 v[29:32], v12, s[2:3] offset:48 +; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:16 +; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:32 +; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v14 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, v14 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:224 +; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v17 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v17 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v18 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v18 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[13:16], s[0:1] offset:192 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v19 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, v19 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v20 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, v20 +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v21 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v21 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v22 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v22 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v23 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v24 +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v25 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, v25 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v26 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v26 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v28 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[13:16], s[0:1] offset:160 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v27 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, v27 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, v28 +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v2 +; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v30 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v29 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, v29 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, v30 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v32 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v31 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, v31 +; GCN-HSA-NEXT: v_mov_b32_e32 v39, v32 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v3 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[17:20], s[0:1] offset:176 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[21:24], s[0:1] offset:128 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v6 +; GCN-HSA-NEXT: s_waitcnt vmcnt(7) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v9 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:144 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v11 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[25:28], s[0:1] offset:96 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v41, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v43, v11 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v1 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:112 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:64 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:80 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:32 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[21:24], s[0:1] offset:48 +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[17:20], s[0:1] +; GCN-HSA-NEXT: global_store_dwordx4 v12, v[13:16], s[0:1] offset:16 +; GCN-HSA-NEXT: s_endpgm %ld = load <32 x i32>, ptr addrspace(1) %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, ptr addrspace(1) %out @@ -3904,29 +3762,29 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: s_mov_b32 s9, s7 ; SI-NOHSA-NEXT: s_mov_b32 s0, s4 ; SI-NOHSA-NEXT: s_mov_b32 s1, s5 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96 -; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96 +; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v8 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v9 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64 ; SI-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v10 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v11 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) expcnt(0) -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v8 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v9 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v12 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v13 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v10 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v11 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v14 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v15 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v32 @@ -3969,12 +3827,12 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v19 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v12 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v13 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NOHSA-NEXT: s_waitcnt expcnt(0) -; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v14 -; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v15 +; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6 +; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7 ; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; SI-NOHSA-NEXT: s_endpgm ; @@ -3986,13 +3844,13 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[32:35], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCNX3-HSA-NEXT: s_add_u32 s8, s2, 48 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1] ; GCNX3-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCNX3-HSA-NEXT: s_add_u32 s10, s2, 64 ; GCNX3-HSA-NEXT: s_addc_u32 s11, s3, 0 @@ -4004,7 +3862,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[32:35], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1] @@ -4013,111 +3871,109 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s7 +; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[6:7] ; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v28 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v29 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s1 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v30 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v31 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v33 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[0:3] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v34 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v35 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8) +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v28 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v29 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[0:3] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v30 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v31 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v24 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v25 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3] ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] -; GCNX3-HSA-NEXT: s_nop 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v26 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v27 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 -; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] +; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v20 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v21 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3] -; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v22 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v23 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[0:3] ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v22 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v23 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(11) -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v16 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v17 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCNX3-HSA-NEXT: s_nop 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v18 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v19 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v12 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v13 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v16 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v17 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3] ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v18 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v19 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(13) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 @@ -4133,6 +3989,7 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCNX3-HSA-NEXT: s_waitcnt vmcnt(14) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -4724,3 +4581,6 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa } attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN-GFX900-HSA: {{.*}} +; GCN-GFX908-HSA: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index 0c399d65d01cc..cb166841c44cd 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -2088,36 +2088,36 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v3 -; GCN-HSA-NEXT: v_bfe_u32 v8, v3, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v3 -; GCN-HSA-NEXT: v_bfe_u32 v9, v3, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 24, v3 +; GCN-HSA-NEXT: v_bfe_u32 v16, v3, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xff, v3 +; GCN-HSA-NEXT: v_bfe_u32 v17, v3, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[15:18] +; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2 +; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 16, 8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GCN-HSA-NEXT: v_bfe_u32 v4, v0, 8, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[7:10] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 24, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v1 ; GCN-HSA-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GCN-HSA-NEXT: v_bfe_u32 v5, v0, 16, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v1 ; GCN-HSA-NEXT: v_bfe_u32 v9, v1, 16, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2 -; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[3:6] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i32: @@ -2324,38 +2324,38 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v3 -; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v3 +; GCN-HSA-NEXT: v_bfe_i32 v14, v3, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v13, v3, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v3, 0, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[12:15] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 +; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v5, v0, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v1 ; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 8 -; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; @@ -2542,51 +2542,53 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v2, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 24, v3 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v3, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v10, v0, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v14, v1, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v18, v2, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xff, v3 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v22, v3, 16, 8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 24, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v8, v0, 8, 8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 24, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v16, v2, 8, 8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 24, v3 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v20, v3, 8, 8 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v1, v4, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 24, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v5, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v6, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 24, v7 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v33, v7, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v4, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xff, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v26, v5, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xff, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v30, v6, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xff, v7 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v34, v7, 16, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 24, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 24, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v28, v7, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xff, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v7, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v24, v4, 8, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 24, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v28, v6, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xff, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v6, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 24, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v28, v5, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v0, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xff, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v5, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v2, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xff, v3 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v3, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xff, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v4, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v32i8_to_v32i32: @@ -2618,64 +2620,64 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v7 ; GCN-HSA-NEXT: v_bfe_u32 v9, v7, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v7 ; GCN-HSA-NEXT: v_bfe_u32 v10, v7, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v6 ; GCN-HSA-NEXT: v_bfe_u32 v8, v6, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v6 ; GCN-HSA-NEXT: v_bfe_u32 v9, v6, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5 ; GCN-HSA-NEXT: v_bfe_u32 v7, v5, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v5 ; GCN-HSA-NEXT: v_bfe_u32 v8, v5, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[6:9] -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; GCN-HSA-NEXT: v_bfe_u32 v6, v4, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v7, v4, 16, 8 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GCN-HSA-NEXT: v_bfe_u32 v9, v4, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v4 +; GCN-HSA-NEXT: v_bfe_u32 v10, v4, 16, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GCN-HSA-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v6, v3, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GCN-HSA-NEXT: v_bfe_u32 v4, v1, 8, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v3 +; GCN-HSA-NEXT: v_bfe_u32 v17, v3, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v3 +; GCN-HSA-NEXT: v_bfe_u32 v18, v3, 16, 8 ; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v0 -; GCN-HSA-NEXT: v_bfe_u32 v10, v0, 16, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GCN-HSA-NEXT: v_bfe_u32 v5, v1, 16, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v2 ; GCN-HSA-NEXT: v_bfe_u32 v14, v2, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GCN-HSA-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GCN-HSA-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff, v0 +; GCN-HSA-NEXT: v_bfe_u32 v6, v0, 16, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GCN-HSA-NEXT: v_bfe_u32 v10, v1, 16, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i32: @@ -2693,47 +2695,47 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 24, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 24, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v33, v7, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xff, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v34, v7, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 24, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v13, v1, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v17, v2, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 24, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v21, v3, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xff, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v0, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v14, v1, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xff, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v18, v2, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v3, 16, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v1, v4, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 24, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v5, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v6, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v4, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xff, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v26, v5, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xff, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v30, v6, 16, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 24, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v7, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xff, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v7, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 24, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 24, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v6, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xff, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v6, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v24, v4, 8, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 24, v5 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v5, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xff, v5 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v5, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v4, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v8, v0, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 24, v1 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 24, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v2, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 24, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v20, v3, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xff, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v0, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff, v1 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v13, v1, 16, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xff, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v17, v2, 16, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xff, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v21, v3, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i8_to_v32i32: @@ -2953,6 +2955,8 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 24, v0 @@ -2967,32 +2971,32 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v2, 16, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v2, 8, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v2, 0, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v7, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v7, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v7, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 24, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v6, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v6, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v6, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v3, 16, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 24, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v5, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v5, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v5, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v3, 8, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v3, 0, 8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 24, v4 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v4, 16, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v4, 8, 8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v4, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 24, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v5, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v5, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v5, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 24, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v6, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v6, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v6, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 24, v7 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v7, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v33, v7, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v7, 0, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 @@ -3029,64 +3033,64 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 24, v7 ; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v7, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_bfe_i32 v14, v3, 16, 8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v6 ; GCN-HSA-NEXT: v_bfe_i32 v8, v6, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 24, v5 ; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[5:8] -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: v_bfe_i32 v13, v3, 8, 8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v4 ; GCN-HSA-NEXT: v_bfe_i32 v6, v4, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v5, v4, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[4:7] -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v3 -; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_bfe_i32 v12, v3, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 +; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v0 ; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v5, v0, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[8:11] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v1 ; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 8 -; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i8_to_v32i32: @@ -3106,10 +3110,14 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 24, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 24, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v7, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v7, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v7, 0, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v7, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v7, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v7, 0, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 24, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v6, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v6, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v6, 0, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v0, 16, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v0, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 8 @@ -3121,24 +3129,20 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v2, 16, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v2, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v3 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v3, 16, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 24, v5 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v5, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v5, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v5, 0, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v3, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v3, 0, 8 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 24, v4 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v4, 16, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v4, 8, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v4, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 24, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v5, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v5, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v5, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 24, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v6, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v6, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v6, 0, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 @@ -3399,12 +3403,6 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v64i8_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s11 -; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 @@ -3413,104 +3411,104 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v13 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v5, v13, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v12, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 24, v15 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v15, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 24, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v14, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v13 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v13, 16, 8 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v10, v12, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff, v15 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v18, v15, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xff, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v22, v14, 16, 8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 24, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v33, v0, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 24, v3 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v37, v3, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 24, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v41, v2, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v14, v1, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xff, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v34, v0, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xff, v3 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v38, v3, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v40, 0xff, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v42, v2, 16, 8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v25 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v5, v25, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v47, 24, v24 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v45, v24, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v51, 24, v27 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v49, v27, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v55, 24, v26 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v53, v26, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v25 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v25, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v44, 0xff, v24 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v46, v24, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v48, 0xff, v27 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v50, v27, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v52, 0xff, v26 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v54, v26, 16, 8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 24, v29 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v29, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 24, v28 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v57, v28, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v63, 24, v31 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v61, v31, 8, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v30 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v1, v30, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v26, v29, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v56, 0xff, v28 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v58, v28, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xff, v31 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v62, v31, 16, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v30 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v30, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v5 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 24, v18 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v18, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xff, v18 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v22, v18, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v1, v5, 8, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 24, v19 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v19, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xff, v19 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v22, v19, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 24, v4 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 24, v16 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v20, v16, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xff, v16 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v16, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v16, v4, 8, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 24, v17 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v20, v17, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xff, v17 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v17, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 24, v7 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 24, v14 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v24, v14, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xff, v14 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v14, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v20, v7, 8, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 24, v15 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v24, v15, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xff, v15 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v15, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v6 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 24, v12 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v24, v12, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xff, v12 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v12, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v12, v6, 8, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 24, v13 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v24, v13, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xff, v13 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v13, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 24, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 24, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v28, v10, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xff, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v10, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v24, v9, 8, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 24, v11 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v28, v11, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xff, v11 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v11, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 24, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v28, v8, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v5, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xff, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v8, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v4, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xff, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v7, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v6, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xff, v9 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v9, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -3526,12 +3524,12 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[4:5] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -3557,6 +3555,10 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s12 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v2 ; GCN-HSA-NEXT: v_bfe_u32 v17, v2, 8, 8 @@ -3564,241 +3566,224 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_u32 v18, v2, 16, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v14 ; GCN-HSA-NEXT: v_bfe_u32 v17, v14, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v14 ; GCN-HSA-NEXT: v_bfe_u32 v18, v14, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 24, v9 +; GCN-HSA-NEXT: v_bfe_u32 v26, v9, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xff, v9 +; GCN-HSA-NEXT: v_bfe_u32 v27, v9, 16, 8 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v15 ; GCN-HSA-NEXT: v_bfe_u32 v17, v15, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v15 ; GCN-HSA-NEXT: v_bfe_u32 v18, v15, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[16:19] -; GCN-HSA-NEXT: v_bfe_u32 v15, v12, 8, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[25:28] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 24, v3 +; GCN-HSA-NEXT: v_bfe_u32 v26, v3, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xff, v3 +; GCN-HSA-NEXT: v_bfe_u32 v27, v3, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 24, v12 +; GCN-HSA-NEXT: v_bfe_u32 v15, v12, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v12 ; GCN-HSA-NEXT: v_bfe_u32 v16, v12, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[25:28] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[14:17] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 24, v13 ; GCN-HSA-NEXT: v_bfe_u32 v15, v13, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v13 ; GCN-HSA-NEXT: v_bfe_u32 v16, v13, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] -; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_bfe_u32 v13, v10, 8, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GCN-HSA-NEXT: v_bfe_u32 v13, v10, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v10 ; GCN-HSA-NEXT: v_bfe_u32 v14, v10, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v11 -; GCN-HSA-NEXT: v_bfe_u32 v13, v11, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v11 -; GCN-HSA-NEXT: v_bfe_u32 v14, v11, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v8 -; GCN-HSA-NEXT: v_bfe_u32 v11, v8, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v8 -; GCN-HSA-NEXT: v_bfe_u32 v12, v8, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v9 -; GCN-HSA-NEXT: v_bfe_u32 v11, v9, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v9 -; GCN-HSA-NEXT: v_bfe_u32 v12, v9, 16, 8 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[10:13] -; GCN-HSA-NEXT: v_bfe_u32 v9, v3, 8, 8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v3 -; GCN-HSA-NEXT: v_bfe_u32 v10, v3, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v0 -; GCN-HSA-NEXT: v_bfe_u32 v10, v0, 16, 8 -; GCN-HSA-NEXT: v_bfe_u32 v18, v1, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v1 -; GCN-HSA-NEXT: v_bfe_u32 v19, v1, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 24, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v32, 24, v0 +; GCN-HSA-NEXT: v_bfe_u32 v30, v0, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v29, 0xff, v0 +; GCN-HSA-NEXT: v_bfe_u32 v31, v0, 16, 8 +; GCN-HSA-NEXT: v_bfe_u32 v26, v1, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xff, v1 +; GCN-HSA-NEXT: v_bfe_u32 v27, v1, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v11 +; GCN-HSA-NEXT: v_bfe_u32 v14, v11, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v11 +; GCN-HSA-NEXT: v_bfe_u32 v15, v11, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[13:16] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GCN-HSA-NEXT: v_bfe_u32 v18, v8, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v8 +; GCN-HSA-NEXT: v_bfe_u32 v19, v8, 16, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[25:28] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v5 -; GCN-HSA-NEXT: v_bfe_u32 v14, v5, 8, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11] -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; GCN-HSA-NEXT: v_bfe_u32 v9, v4, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v15, v5, 16, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v4 -; GCN-HSA-NEXT: v_bfe_u32 v10, v4, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[17:20] +; GCN-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 24, v6 +; GCN-HSA-NEXT: v_bfe_u32 v22, v6, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xff, v6 +; GCN-HSA-NEXT: v_bfe_u32 v23, v6, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[21:24] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v6 -; GCN-HSA-NEXT: v_bfe_u32 v18, v6, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v6 -; GCN-HSA-NEXT: v_bfe_u32 v19, v6, 16, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[17:20] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v7 -; GCN-HSA-NEXT: v_bfe_u32 v1, v7, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xff, v7 -; GCN-HSA-NEXT: v_bfe_u32 v2, v7, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v7 +; GCN-HSA-NEXT: v_bfe_u32 v18, v7, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v7 +; GCN-HSA-NEXT: v_bfe_u32 v19, v7, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v4 +; GCN-HSA-NEXT: v_bfe_u32 v14, v4, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v4 +; GCN-HSA-NEXT: v_bfe_u32 v15, v4, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[13:16] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GCN-HSA-NEXT: v_bfe_u32 v10, v5, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v5 +; GCN-HSA-NEXT: v_bfe_u32 v11, v5, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[13:16] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[29:32] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i8_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s11 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v17 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v1, v17, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v17, 16, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v38 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v1, v38, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v38 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v38, 16, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v5, v16, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 24, v19 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v19, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 24, v18 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v13, v18, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xff, v16 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v6, v16, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xff, v19 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v19, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff, v18 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v14, v18, 16, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 24, v29 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v17, v29, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 24, v28 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v21, v28, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 24, v31 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v31, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 24, v30 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v41, v30, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xff, v29 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v18, v29, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v28, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xff, v31 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v26, v31, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xff, v30 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v42, v30, 16, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v33, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 24, v32 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v45, v32, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 24, v35 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v49, v35, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 24, v34 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v53, v34, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xff, v33 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v30, v33, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xff, v32 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v46, v32, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xff, v35 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v50, v35, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, 0xff, v34 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v54, v34, 16, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 24, v37 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v33, v37, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 24, v36 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v57, v36, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 24, v39 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v61, v39, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xff, v37 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v34, v37, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xff, v36 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v58, v36, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, 0xff, v39 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v62, v39, 16, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v5 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 24, v18 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v21, v18, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v18 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v18, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 24, v14 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 24, v19 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v21, v19, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v19 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v19, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v24, v14, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xff, v14 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v14, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 24, v10 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 24, v15 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v24, v15, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xff, v15 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v15, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v10, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 24, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v24, v12, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xff, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v12, 16, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xff, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v10, 16, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v20, v16, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xff, v16 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v21, v16, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 24, v13 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v24, v13, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xff, v13 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v13, 16, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 24, v11 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v11, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xff, v11 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v11, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 24, v17 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v20, v17, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xff, v17 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v21, v17, 16, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 24, v9 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v24, v9, 8, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xff, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 24, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v8, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xff, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v8, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v9, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v1, v5, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 24, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v4, 8, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 24, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 24, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v20, v7, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v12, v6, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v5, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xff, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v17, v4, 16, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xff, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v21, v7, 16, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v13, v6, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -4189,119 +4174,111 @@ define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v64i8_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s11 -; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[10:13], off, s[4:7], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 24, v11 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v11, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v11, 0, 8 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 24, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v10, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v10, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v10, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 24, v13 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v13, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v13, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v13, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 24, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v12, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v12, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v12, 0, 8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 24, v17 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v17, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v17, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v17, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 24, v16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v16, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v33, v16, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v16, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 24, v19 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v19, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v37, v19, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v19, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 24, v18 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v18, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v41, v18, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v18, 0, 8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 24, v21 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v21, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v21, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v21, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 24, v20 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v20, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v45, v20, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v20, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 24, v23 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v23, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v49, v23, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v23, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 24, v22 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v22, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v53, v22, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v22, 0, 8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v25 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v25, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v25, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v25, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 24, v24 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v24, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v57, v24, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v24, 0, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 24, v27 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v27, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v61, v27, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v27, 0, 8 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 24, v5 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v18 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v18, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v18, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v18, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v5, 16, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 24, v19 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v19, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v19, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v19, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v5, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v5, 0, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 24, v16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v16, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v16, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v16, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 24, v26 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v26, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v26, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v26, 0, 8 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 24, v4 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v17 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v17, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v17, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v17, 0, 8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v4, 16, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v14 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v14, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v14, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v14, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v4, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v4, 0, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v15 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v15, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v15, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v15, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 24, v7 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v12 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v12, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v12, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v12, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v7, 16, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v13 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v13, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v13, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v7, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v7, 0, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v10, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v10, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v10, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v6 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 24, v11 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v11, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v11, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v11, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v6, 16, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 24, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v8, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v8, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v8, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v6, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v6, 0, 8 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 24, v9 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 16, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v9, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v9, 0, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -4324,8 +4301,8 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -4343,20 +4320,20 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v15 ; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v16, v15, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v15, v15, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 24, v12 ; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v15, v12, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v14, v12, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[14:17] ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 @@ -4365,11 +4342,11 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 24, v13 ; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v14, v13, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[13:16] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 @@ -4385,211 +4362,197 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v11 ; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v12, v11, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v11, v11, 0, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 24, v8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v11, v8, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v10, v8, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 24, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 24, v8 +; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v14, v8, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v13, v8, 0, 8 ; GCN-HSA-NEXT: v_bfe_i32 v11, v9, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v9, v9, 0, 8 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[9:12] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[13:16] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[9:12] ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v6 -; GCN-HSA-NEXT: v_bfe_i32 v13, v6, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v11, v6, 0, 8 +; GCN-HSA-NEXT: v_bfe_i32 v8, v6, 0, 8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v6 +; GCN-HSA-NEXT: v_bfe_i32 v10, v6, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v9, v6, 8, 8 +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v1 +; GCN-HSA-NEXT: v_bfe_i32 v14, v1, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v13, v1, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 0, 8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 24, v0 +; GCN-HSA-NEXT: v_bfe_i32 v18, v0, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v17, v0, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v16, v0, 0, 8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 24, v7 ; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v8, v7, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v4 ; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 16, 8 ; GCN-HSA-NEXT: v_bfe_i32 v7, v4, 8, 8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v4, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: s_waitcnt vmcnt(10) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v1 -; GCN-HSA-NEXT: v_bfe_i32 v13, v1, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v11, v1, 0, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v0 -; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v5 -; GCN-HSA-NEXT: v_bfe_i32 v17, v5, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v15, v5, 0, 8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 24, v5 +; GCN-HSA-NEXT: v_bfe_i32 v9, v5, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v3 -; GCN-HSA-NEXT: v_bfe_i32 v17, v3, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v16, v3, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v15, v3, 0, 8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 24, v2 -; GCN-HSA-NEXT: v_bfe_i32 v4, v2, 16, 8 -; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 8, 8 -; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 24, v3 +; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v21, v3, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v20, v3, 0, 8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 24, v2 +; GCN-HSA-NEXT: v_bfe_i32 v5, v2, 16, 8 +; GCN-HSA-NEXT: v_bfe_i32 v4, v2, 8, 8 +; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i8_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s11 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[26:29], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[30:33], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[34:37], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 24, v11 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v11, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v11, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v11, 0, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 24, v5 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v18 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v18, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v18, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v18, 0, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v16, 0, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 24, v19 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v19, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v19, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v19, 0, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 24, v16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v16, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v16, 8, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v17, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v17, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v17, 0, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 24, v36 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v36, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v36, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v36, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 24, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v10, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v10, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v10, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 24, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v13, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v13, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v13, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 24, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v12, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v12, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 24, v27 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v27, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v27, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v27, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v26 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v26, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v26, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v26, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 24, v29 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v29, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v29, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v29, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 24, v28 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v40, v28, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v39, v28, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v28, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v45, 24, v31 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v44, v31, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v43, v31, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v31, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 24, v30 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v30, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v47, v30, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v30, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 24, v33 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v33, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v33, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v33, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 24, v32 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v32, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v51, v32, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v32, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 24, v35 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v35, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v55, v35, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v35, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v61, 24, v34 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v60, v34, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v59, v34, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v34, 0, 8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 24, v37 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v37, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v37, 8, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v37, 0, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 24, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v14 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v14, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v14, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v14, 0, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v11, 16, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v15 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v15, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v15, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v15, 0, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v11, 8, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v12, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v12, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v12, 0, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v11, 0, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v13 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v13, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v13, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v13, 0, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v5, 16, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v10, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v10, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v10, 0, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v5, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v5, 0, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 24, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v4, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v4, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v4, 0, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 24, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v7, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v7, 0, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v6, 16, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 24, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v8, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v8, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v8, 0, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v6, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v6, 0, 8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 24, v9 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v9, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v9, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v9, 0, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -6076,39 +6039,39 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: flat_load_dwordx2 v[15:16], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v16 ; GCN-HSA-NEXT: v_bfe_u32 v0, v16, 16, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v15 ; GCN-HSA-NEXT: v_bfe_u32 v3, v15, 16, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: v_bfe_u32 v9, v16, 8, 8 ; GCN-HSA-NEXT: v_bfe_u32 v13, v15, 8, 8 ; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v15 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[3:6] -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i8_to_v8i64: @@ -6551,49 +6514,52 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v3, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v3 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v2, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xff, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 24, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v23, v1, 16, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 24, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v27, v0, 16, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 24, v3 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v31, v3, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v3, v2, 16, 8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v5 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v7, v3, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v12, 24, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v19, v2, 8, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v10, v0, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v23, v3, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v27, v0, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v31, v1, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v29, 0xff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v25, 0xff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v21, 0xff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xff, v2 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v16i8_to_v16i64: @@ -6611,72 +6577,72 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v5 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, v5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v3 ; GCN-HSA-NEXT: v_bfe_u32 v8, v3, 16, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GCN-HSA-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 ; GCN-HSA-NEXT: v_bfe_u32 v11, v0, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v1 ; GCN-HSA-NEXT: v_bfe_u32 v14, v1, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 -; GCN-HSA-NEXT: v_bfe_u32 v10, v2, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GCN-HSA-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_bfe_u32 v13, v3, 8, 8 +; GCN-HSA-NEXT: v_bfe_u32 v19, v2, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s1 +; GCN-HSA-NEXT: v_bfe_u32 v22, v3, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xff, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s0 ; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v17, v1, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xff, v1 +; GCN-HSA-NEXT: v_bfe_u32 v26, v1, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[20:23] ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[24:27] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i64: @@ -6690,49 +6656,49 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v5 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 24, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v3, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v6, v1, 8, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 24, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v14, v0, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v8, v0, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xff, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v18, v3, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xff, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v2, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 24, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v24, v2, 16, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v0, v1, 16, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v29 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v4, v3, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v2, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v15, v2, 8, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v19, v3, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v23, v1, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 24, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v31, v0, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xff, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v0, 16, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v21, 0xff, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xff, v3 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v16i8_to_v16i64: @@ -6938,81 +6904,79 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3 ; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0 ; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s6, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s7, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s7 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s7 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v16i8_to_v16i64: @@ -7025,8 +6989,8 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 @@ -7035,32 +6999,24 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s8, s2, 24 ; GCN-HSA-NEXT: s_lshr_b32 s10, s2, 8 +; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 24 +; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8 ; GCN-HSA-NEXT: s_lshr_b32 s18, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 8 ; GCN-HSA-NEXT: s_mov_b32 s22, s3 +; GCN-HSA-NEXT: s_lshr_b32 s24, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s5, 8 +; GCN-HSA-NEXT: s_mov_b32 s28, s5 +; GCN-HSA-NEXT: s_ashr_i32 s27, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s29, s5, 24 ; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 31 ; GCN-HSA-NEXT: s_ashr_i32 s9, s3, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 24 -; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 -; GCN-HSA-NEXT: s_ashr_i32 s4, s5, 24 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_lshr_b32 s2, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s3, s5, 31 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 -; GCN-HSA-NEXT: s_lshr_b32 s4, s5, 8 -; GCN-HSA-NEXT: s_mov_b32 s24, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[2:3], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[28:29], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[26:27], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 @@ -7068,68 +7024,76 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s9 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[14:17] ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s19 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[6:9] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s21 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[10:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[18:21] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i8_to_v16i64: @@ -7148,16 +7112,14 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 24 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 24 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s4, 8 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s6, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s6, 24 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 8 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s5, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 8 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s5 @@ -7172,10 +7134,8 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s5, 31 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000 @@ -7188,39 +7148,42 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s38 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s29 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s13 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i8_to_v16i64: @@ -7430,12 +7393,6 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v32i8_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s11 -; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 @@ -7444,137 +7401,96 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[17:20], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v12, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v3, v11, 8, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v10, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v10 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v55, v13, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xff, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 24, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v10, 16, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 24, v11 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v11, 16, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 24, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v33, v12, 16, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 24, v13 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v37, v13, 16, 8 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 24, v20 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v43, v17, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v41, 0xff, v17 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v47, v18, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xff, v18 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v51, v19, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xff, v19 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v23, v20, 8, 8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v21, 0xff, v20 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 24, v17 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v57, v17, 16, 8 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v18 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v4, v18, 16, 8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v19 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v0, v19, 16, 8 -; GCN-NOHSA-SI-NEXT: v_bfe_u32 v8, v20, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v58, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v60, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v38, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v40, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v36, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v50, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v52, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v46, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v48, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v42, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v44, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v54, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v56, v9 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v53, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v54, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v55, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v56, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v12, v2, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 24, v1 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v15, v1, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 24, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v18, v0, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v9 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 24, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v7, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 24, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v24, v6, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v12, v5, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 24, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v15, v4, 16, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v9 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v12 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v9 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v53, off, s[12:15], 0 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v55, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v56, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v54, v9 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v56, v9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v20, v3, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xff, v3 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v23, v2, 8, 8 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v21, 0xff, v2 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v8, v3, 16, 8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[57:60], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v9 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v16, v0, 8, 8 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[53:56], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v1 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v26, v7, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v20, v6, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v30, v5, 8, 8 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v34, v4, 8, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xff, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xff, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xff, v7 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v32i8_to_v32i64: @@ -7586,134 +7502,151 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[3:6], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[7:10], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v1 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, v1 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GCN-HSA-NEXT: v_bfe_u32 v10, v4, 16, 8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GCN-HSA-NEXT: v_bfe_u32 v10, v3, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v1 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, v1 +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v5 +; GCN-HSA-NEXT: v_bfe_u32 v11, v5, 16, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[11:14] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v4 +; GCN-HSA-NEXT: v_bfe_u32 v14, v4, 16, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GCN-HSA-NEXT: v_bfe_u32 v10, v2, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v3 +; GCN-HSA-NEXT: v_bfe_u32 v17, v3, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[17:20] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v9 -; GCN-HSA-NEXT: v_bfe_u32 v10, v9, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 24, v10 +; GCN-HSA-NEXT: v_bfe_u32 v20, v10, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v8 -; GCN-HSA-NEXT: v_bfe_u32 v10, v8, 16, 8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v9 +; GCN-HSA-NEXT: v_bfe_u32 v11, v9, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v7 -; GCN-HSA-NEXT: v_bfe_u32 v10, v7, 16, 8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v8 +; GCN-HSA-NEXT: v_bfe_u32 v14, v8, 16, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[14:17] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GCN-HSA-NEXT: v_bfe_u32 v10, v6, 16, 8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 -; GCN-HSA-NEXT: v_bfe_u32 v12, v5, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v5 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 -; GCN-HSA-NEXT: v_bfe_u32 v12, v4, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v4 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: v_bfe_u32 v12, v3, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 -; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[10:13] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_bfe_u32 v11, v9, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[9:12] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v5 -; GCN-HSA-NEXT: v_bfe_u32 v0, v5, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v7 +; GCN-HSA-NEXT: v_bfe_u32 v17, v7, 16, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[17:20] +; GCN-HSA-NEXT: v_bfe_u32 v22, v6, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xff, v6 +; GCN-HSA-NEXT: v_bfe_u32 v11, v5, 8, 8 +; GCN-HSA-NEXT: v_bfe_u32 v15, v4, 8, 8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[20:23] +; GCN-HSA-NEXT: v_bfe_u32 v28, v9, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v6 +; GCN-HSA-NEXT: v_bfe_u32 v19, v3, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v3 +; GCN-HSA-NEXT: v_bfe_u32 v0, v6, 16, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_bfe_u32 v24, v10, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[13:16] +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[22:25] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_bfe_u32 v16, v8, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v1 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x80 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[14:17] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[26:29] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_bfe_u32 v9, v7, 8, 8 -; GCN-HSA-NEXT: v_bfe_u32 v13, v6, 8, 8 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-HSA-NEXT: v_bfe_u32 v32, v8, 8, 8 +; GCN-HSA-NEXT: v_bfe_u32 v36, v7, 8, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v30, 0xff, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[30:33] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[34:37] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i64: @@ -7726,92 +7659,84 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v53, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, v53 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v8 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v34, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v8 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v11, 8, 8 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 24, v15 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v48, v15, 16, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 24, v16 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v45, v16, 16, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v44, v16, 8, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v53 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xff, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v41, v15, 8, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, v53 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xff, v15 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 24, v14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, v53 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v36, v14, 16, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 24, v10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, v53 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v10, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v3, v10, 16, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xff, v11 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v20, v12, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xff, v12 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 24, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v12, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v13, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xff, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 24, v11 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v11, 16, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 24, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17 -; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 24, v5 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v11, v5, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 24, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v14, v3, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 24, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v17, v1, 16, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v13, v6, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff, v6 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v7, 8, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v7 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v5, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 24, v6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v23, v6, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v17, v4, 16, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v4, 8, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 24, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v7, v7, 16, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v3, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xff, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v2, 16, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v2, 8, 8 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v1, 8, 8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 24, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_u32 v18, v0, 16, 8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xff, v2 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i8_to_v32i64: @@ -8175,166 +8100,166 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v2 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v3 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s22, v0 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s23, v1 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s34, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s35, v3 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s54, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s55, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s24, v6 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s25, v7 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s13, v5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s18, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s18, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s18, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s22, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s22, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s22, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s24, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s24, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s24, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s12, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s12, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s19, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[18:19], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[22:23], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[24:25], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s45 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s19, 8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s43 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s23, 16 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s52, v6 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s53, v7 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s20, v4 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s21, v5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s34, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s34, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s34, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s54, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s54, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s54, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s52, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s52, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s52, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s20, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s20, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s20, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s35, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s35, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s60, s35 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s55, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s55, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s62, s55 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s53, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s53, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s64, s53 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s21, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s21, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s21 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[20:21], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[52:53], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[54:55], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[34:35], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s21, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s21, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[64:65], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s53, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s11, s53, 24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s68 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s69 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s55, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s17, s55, 24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s66 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[60:61], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s67 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s19, s35, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s35, 24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s58 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[56:57], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s59 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s50 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s51 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s23 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s19 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s54 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s55 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s39 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s23, 8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s23 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s25, 16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s25, 8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s17 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s25, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s23, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 24 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s19, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s19, 24 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s25, 24 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s15 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s52 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s53 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s13, 16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s27 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 8 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s13, 31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s13, 24 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s13 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[22:23], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[36:37], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[38:39], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s22 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s23 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[24:25], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, s16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, s17 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, s11 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, s13 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s27 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s29 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s31 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s37 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s39 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s43 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s45 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s49 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s47 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s35 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i8_to_v32i64: @@ -8347,223 +8272,222 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v6 -; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v4 -; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v5 -; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v7 -; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 24 -; GCN-HSA-NEXT: s_lshr_b32 s10, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s8, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 8 -; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 8 -; GCN-HSA-NEXT: s_lshr_b32 s12, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 8 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s6, s9, 16 -; GCN-HSA-NEXT: s_mov_b32 s28, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 -; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[18:19], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-HSA-NEXT: s_lshr_b32 s22, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s2, 24 +; GCN-HSA-NEXT: s_lshr_b32 s26, s2, 8 +; GCN-HSA-NEXT: s_lshr_b32 s28, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s4, 24 +; GCN-HSA-NEXT: s_lshr_b32 s34, s4, 8 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[4:5], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x80000 +; GCN-HSA-NEXT: s_ashr_i32 s31, s5, 31 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s40, v2 -; GCN-HSA-NEXT: v_readfirstlane_b32 s41, v3 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000 -; GCN-HSA-NEXT: v_readfirstlane_b32 s44, v0 -; GCN-HSA-NEXT: v_readfirstlane_b32 s45, v1 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GCN-HSA-NEXT: s_mov_b32 s22, s7 -; GCN-HSA-NEXT: s_lshr_b32 s8, s9, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[16:17], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[12:13], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[4:5], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: s_lshr_b32 s42, s44, 16 -; GCN-HSA-NEXT: s_lshr_b32 s48, s44, 24 -; GCN-HSA-NEXT: s_lshr_b32 s28, s44, 8 -; GCN-HSA-NEXT: s_lshr_b32 s6, s45, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s45, 8 -; GCN-HSA-NEXT: s_mov_b32 s4, s45 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s44, s40, 16 -; GCN-HSA-NEXT: s_lshr_b32 s50, s40, 24 -; GCN-HSA-NEXT: s_lshr_b32 s52, s40, 8 -; GCN-HSA-NEXT: s_lshr_b32 s20, s41, 16 -; GCN-HSA-NEXT: s_lshr_b32 s12, s41, 8 -; GCN-HSA-NEXT: s_mov_b32 s14, s41 -; GCN-HSA-NEXT: s_ashr_i32 s33, s9, 31 -; GCN-HSA-NEXT: s_ashr_i32 s37, s7, 31 -; GCN-HSA-NEXT: s_ashr_i32 s38, s7, 24 -; GCN-HSA-NEXT: s_ashr_i32 s34, s9, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s14, v4 +; GCN-HSA-NEXT: v_readfirstlane_b32 s15, v5 +; GCN-HSA-NEXT: v_readfirstlane_b32 s18, v6 +; GCN-HSA-NEXT: v_readfirstlane_b32 s19, v7 +; GCN-HSA-NEXT: s_lshr_b32 s16, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 8 +; GCN-HSA-NEXT: s_mov_b32 s10, s3 +; GCN-HSA-NEXT: s_lshr_b32 s6, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s38, s5, 8 +; GCN-HSA-NEXT: s_mov_b32 s40, s5 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[30:31], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[26:27], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_ashr_i32 s30, s45, 31 -; GCN-HSA-NEXT: s_ashr_i32 s31, s45, 24 -; GCN-HSA-NEXT: s_ashr_i32 s35, s41, 31 -; GCN-HSA-NEXT: s_ashr_i32 s36, s41, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s48, s14, 24 +; GCN-HSA-NEXT: s_lshr_b32 s50, s14, 8 +; GCN-HSA-NEXT: s_lshr_b32 s20, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s15, 8 +; GCN-HSA-NEXT: s_mov_b32 s52, s15 +; GCN-HSA-NEXT: s_lshr_b32 s54, s18, 16 +; GCN-HSA-NEXT: s_lshr_b32 s56, s18, 24 +; GCN-HSA-NEXT: s_lshr_b32 s58, s18, 8 +; GCN-HSA-NEXT: s_lshr_b32 s60, s19, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s19, 8 +; GCN-HSA-NEXT: s_mov_b32 s64, s19 +; GCN-HSA-NEXT: s_ashr_i32 s33, s5, 24 +; GCN-HSA-NEXT: s_ashr_i32 s36, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s37, s3, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[40:41], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[38:39], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[14:15], 0x80000 +; GCN-HSA-NEXT: s_ashr_i32 s30, s15, 31 +; GCN-HSA-NEXT: s_ashr_i32 s38, s15, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[66:67], s[18:19], 0x80000 +; GCN-HSA-NEXT: s_ashr_i32 s39, s19, 31 +; GCN-HSA-NEXT: s_ashr_i32 s40, s19, 24 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[52:53], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GCN-HSA-NEXT: s_add_u32 s54, s0, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55 -; GCN-HSA-NEXT: s_add_u32 s54, s0, 64 -; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55 -; GCN-HSA-NEXT: s_add_u32 s54, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 -; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s41 -; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[20:21], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[64:65], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[62:63], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[60:61], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[50:51], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[48:49], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[46:47], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[58:59], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[56:57], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[54:55], 0x80000 +; GCN-HSA-NEXT: s_add_u32 s50, s0, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s51, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s44 +; GCN-HSA-NEXT: s_add_u32 s44, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s45 +; GCN-HSA-NEXT: s_addc_u32 s45, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s42 +; GCN-HSA-NEXT: s_add_u32 s42, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s44 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41 -; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s55 -; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: s_addc_u32 s43, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s43 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 +; GCN-HSA-NEXT: s_add_u32 s26, s0, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 ; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GCN-HSA-NEXT: s_add_u32 s26, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s24 -; GCN-HSA-NEXT: s_add_u32 s24, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GCN-HSA-NEXT: s_addc_u32 s25, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s18 -; GCN-HSA-NEXT: s_add_u32 s18, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 -; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GCN-HSA-NEXT: s_add_u32 s16, s0, 48 +; GCN-HSA-NEXT: s_add_u32 s16, s0, 0x90 +; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s17 +; GCN-HSA-NEXT: s_add_u32 s16, s0, 0x80 +; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s17 +; GCN-HSA-NEXT: s_add_u32 s16, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s35 +; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[24:27] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -12818,17 +12742,15 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1 -; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s21, v5 +; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 ; GCN-HSA-NEXT: s_ashr_i32 s6, s5, 24 ; GCN-HSA-NEXT: s_bfe_i32 s7, s5, 0x80010 ; GCN-HSA-NEXT: s_bfe_i32 s8, s5, 0x80008 @@ -12843,11 +12765,11 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_sext_i32_i8 s3, s3 ; GCN-HSA-NEXT: s_ashr_i32 s15, s2, 24 ; GCN-HSA-NEXT: s_bfe_i32 s16, s2, 0x80010 -; GCN-HSA-NEXT: s_ashr_i32 s22, s21, 24 -; GCN-HSA-NEXT: s_bfe_i32 s23, s21, 0x80010 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_readfirstlane_b32 s18, v6 ; GCN-HSA-NEXT: v_readfirstlane_b32 s19, v7 ; GCN-HSA-NEXT: v_readfirstlane_b32 s20, v4 +; GCN-HSA-NEXT: v_readfirstlane_b32 s21, v5 ; GCN-HSA-NEXT: s_lshl_b32 s6, s6, 16 ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 16 @@ -12860,57 +12782,59 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-HSA-NEXT: s_lshl_b32 s14, s14, 16 ; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_bfe_i32 s17, s2, 0x80008 ; GCN-HSA-NEXT: s_lshl_b32 s15, s15, 16 ; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff -; GCN-HSA-NEXT: s_lshl_b32 s22, s22, 16 -; GCN-HSA-NEXT: s_and_b32 s23, s23, 0xffff -; GCN-HSA-NEXT: s_bfe_i32 s17, s2, 0x80008 +; GCN-HSA-NEXT: s_ashr_i32 s22, s21, 24 +; GCN-HSA-NEXT: s_bfe_i32 s23, s21, 0x80010 ; GCN-HSA-NEXT: s_bfe_i32 s24, s21, 0x80008 ; GCN-HSA-NEXT: s_sext_i32_i8 s21, s21 ; GCN-HSA-NEXT: s_ashr_i32 s25, s20, 24 +; GCN-HSA-NEXT: s_bfe_i32 s26, s20, 0x80010 +; GCN-HSA-NEXT: s_bfe_i32 s27, s20, 0x80008 +; GCN-HSA-NEXT: s_sext_i32_i8 s20, s20 ; GCN-HSA-NEXT: s_or_b32 s6, s7, s6 -; GCN-HSA-NEXT: s_bfe_i32 s7, s20, 0x80010 +; GCN-HSA-NEXT: s_ashr_i32 s7, s19, 24 ; GCN-HSA-NEXT: s_or_b32 s5, s5, s8 -; GCN-HSA-NEXT: s_bfe_i32 s8, s20, 0x80008 -; GCN-HSA-NEXT: s_sext_i32_i8 s20, s20 +; GCN-HSA-NEXT: s_bfe_i32 s8, s19, 0x80010 ; GCN-HSA-NEXT: s_or_b32 s9, s10, s9 -; GCN-HSA-NEXT: s_ashr_i32 s10, s19, 24 +; GCN-HSA-NEXT: s_bfe_i32 s10, s19, 0x80008 +; GCN-HSA-NEXT: s_sext_i32_i8 s19, s19 ; GCN-HSA-NEXT: s_or_b32 s4, s4, s11 -; GCN-HSA-NEXT: s_bfe_i32 s11, s19, 0x80010 +; GCN-HSA-NEXT: s_ashr_i32 s11, s18, 24 ; GCN-HSA-NEXT: s_or_b32 s12, s13, s12 -; GCN-HSA-NEXT: s_bfe_i32 s13, s19, 0x80008 -; GCN-HSA-NEXT: s_sext_i32_i8 s19, s19 +; GCN-HSA-NEXT: s_bfe_i32 s13, s18, 0x80010 ; GCN-HSA-NEXT: s_or_b32 s3, s3, s14 -; GCN-HSA-NEXT: s_ashr_i32 s14, s18, 24 -; GCN-HSA-NEXT: s_or_b32 s15, s16, s15 -; GCN-HSA-NEXT: s_bfe_i32 s16, s18, 0x80010 -; GCN-HSA-NEXT: s_or_b32 s22, s23, s22 -; GCN-HSA-NEXT: s_bfe_i32 s23, s18, 0x80008 +; GCN-HSA-NEXT: s_bfe_i32 s14, s18, 0x80008 ; GCN-HSA-NEXT: s_sext_i32_i8 s18, s18 ; GCN-HSA-NEXT: s_sext_i32_i8 s2, s2 ; GCN-HSA-NEXT: s_lshl_b32 s17, s17, 16 -; GCN-HSA-NEXT: s_lshl_b32 s24, s24, 16 +; GCN-HSA-NEXT: s_or_b32 s15, s16, s15 +; GCN-HSA-NEXT: s_lshl_b32 s16, s22, 16 +; GCN-HSA-NEXT: s_and_b32 s22, s23, 0xffff +; GCN-HSA-NEXT: s_lshl_b32 s23, s24, 16 ; GCN-HSA-NEXT: s_and_b32 s21, s21, 0xffff -; GCN-HSA-NEXT: s_lshl_b32 s25, s25, 16 -; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 16 +; GCN-HSA-NEXT: s_lshl_b32 s24, s25, 16 +; GCN-HSA-NEXT: s_and_b32 s25, s26, 0xffff +; GCN-HSA-NEXT: s_lshl_b32 s26, s27, 16 ; GCN-HSA-NEXT: s_and_b32 s20, s20, 0xffff +; GCN-HSA-NEXT: s_lshl_b32 s7, s7, 16 +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_lshl_b32 s10, s10, 16 -; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 16 ; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-HSA-NEXT: s_lshl_b32 s11, s11, 16 +; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-HSA-NEXT: s_lshl_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff -; GCN-HSA-NEXT: s_lshl_b32 s23, s23, 16 ; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff ; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-HSA-NEXT: s_or_b32 s21, s21, s24 -; GCN-HSA-NEXT: s_or_b32 s7, s7, s25 -; GCN-HSA-NEXT: s_or_b32 s8, s20, s8 -; GCN-HSA-NEXT: s_or_b32 s10, s11, s10 -; GCN-HSA-NEXT: s_or_b32 s11, s19, s13 -; GCN-HSA-NEXT: s_or_b32 s13, s16, s14 -; GCN-HSA-NEXT: s_or_b32 s14, s18, s23 +; GCN-HSA-NEXT: s_or_b32 s16, s22, s16 +; GCN-HSA-NEXT: s_or_b32 s21, s21, s23 +; GCN-HSA-NEXT: s_or_b32 s22, s25, s24 +; GCN-HSA-NEXT: s_or_b32 s20, s20, s26 +; GCN-HSA-NEXT: s_or_b32 s7, s8, s7 +; GCN-HSA-NEXT: s_or_b32 s8, s19, s10 +; GCN-HSA-NEXT: s_or_b32 s10, s13, s11 +; GCN-HSA-NEXT: s_or_b32 s11, s18, s14 ; GCN-HSA-NEXT: s_or_b32 s2, s2, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 @@ -12932,18 +12856,18 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index b4c0b7497b95f..f5f6f2c4c5932 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -1525,6 +1525,7 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v24, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 @@ -1533,21 +1534,20 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v6 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v0 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v5 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v6 +; SI-NEXT: ds_write2_b64 v24, v[22:23], v[20:21] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v24, v[18:19], v[16:17] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v24, v[14:15], v[12:13] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v24, v[10:11], v[8:9] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i32: @@ -1794,6 +1794,7 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v24, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v0 @@ -1802,21 +1803,20 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v5 ; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v4 +; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v7 +; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v6 ; SI-NEXT: v_bfe_i32 v8, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v10, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v12, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v14, v2, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v7 -; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v6 ; SI-NEXT: v_bfe_i32 v16, v5, 0, 16 ; SI-NEXT: v_bfe_i32 v18, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v0, v7, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v6, 0, 16 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 +; SI-NEXT: v_bfe_i32 v20, v7, 0, 16 +; SI-NEXT: v_bfe_i32 v22, v6, 0, 16 +; SI-NEXT: ds_write2_b64 v24, v[22:23], v[20:21] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v24, v[18:19], v[16:17] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v24, v[14:15], v[12:13] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v24, v[10:11], v[8:9] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i32: @@ -1828,28 +1828,28 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v7 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v6 +; VI-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v20, v7, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v22, v6, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v3 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v2 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v5 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v4 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v12, v3, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v7 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v6 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v7, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v6, 0, 16 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v5 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v5, 0, 16 -; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[22:23], v[20:21] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[18:19], v[16:17] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i32: @@ -1860,28 +1860,28 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v7 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v6 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v1 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v3 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v2 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v5 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v4 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v7 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v6 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v7, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v6, 0, 16 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v5 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[22:23], v[20:21] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[18:19], v[16:17] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v16i16_to_v16i32: @@ -2003,28 +2003,28 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 ; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v1 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v7 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v6 +; VI-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v22, v7, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v20, v6, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v1 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v3 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v2 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 ; VI-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v14, v3, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 -; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 ; VI-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 -; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 -; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 -; VI-DS128-NEXT: ds_write_b128 v4, v[8:11] +; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v0, v[20:23] offset:48 +; VI-DS128-NEXT: ds_write_b128 v0, v[16:19] offset:32 +; VI-DS128-NEXT: ds_write_b128 v0, v[12:15] offset:16 +; VI-DS128-NEXT: ds_write_b128 v0, v[8:11] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i32: @@ -2035,28 +2035,28 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 ; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v1 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v7 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v7, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v20, v6, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v1 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v3 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v2 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 ; GFX9-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v14, v3, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 ; GFX9-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11] +; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[20:23] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[16:19] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[12:15] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[8:11] ; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = sext <16 x i16> %load to <16 x i32> @@ -2072,54 +2072,53 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out ; SI-NEXT: v_mov_b32_e32 v12, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 -; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:6 offset1:7 ; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 -; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v28, s0 ; SI-NEXT: s_waitcnt lgkmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v2 ; SI-NEXT: s_waitcnt lgkmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; SI-NEXT: ds_write2_b64 v28, v[20:21], v[18:19] offset0:14 offset1:15 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v4 +; SI-NEXT: ds_write2_b64 v28, v[22:23], v[20:21] offset0:12 offset1:13 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v10 +; SI-NEXT: ds_write2_b64 v28, v[24:25], v[22:23] offset0:10 offset1:11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 ; SI-NEXT: v_and_b32_e32 v24, 0xffff, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v32, s0 -; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15 -; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11 -; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9 -; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset1:1 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v8 +; SI-NEXT: ds_write2_b64 v28, v[26:27], v[24:25] offset0:8 offset1:9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; SI-NEXT: ds_write2_b64 v28, v[7:8], v[22:23] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v28, v[9:10], v[20:21] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v28, v[3:4], v[18:19] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v28, v[5:6], v[16:17] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32: @@ -2127,112 +2126,111 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v1 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v6 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v5 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; VI-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v4 -; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; VI-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v1 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; VI-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v12 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[8:11], v12 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v20, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v5 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; VI-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v3 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; VI-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v7 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[4:5], v[18:19] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v7 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[6:7], v[18:19] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[14:15], v[18:19] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[12:13], v[18:19] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[8:9], v[14:15] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[10:11], v[6:7] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[0:1], v[4:5] offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v20, v[2:3], v[16:17] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v1 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v0 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v6 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v5 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v4 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v1 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v5 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v12 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[12:15], v12 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v20, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v5 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v3 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[4:5], v[18:19] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v7 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[6:7], v[18:19] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v11 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[10:11], v[18:19] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v9 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v14 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[8:9], v[18:19] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[8:9], v[13:14] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[10:11], v[6:7] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[0:1], v[4:5] offset1:1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v20, v[2:3], v[16:17] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v32i16_to_v32i32: @@ -2437,108 +2435,111 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v20, s1 -; VI-DS128-NEXT: ds_read_b128 v[0:3], v20 -; VI-DS128-NEXT: ds_read_b128 v[4:7], v20 offset:16 -; VI-DS128-NEXT: ds_read_b128 v[16:19], v20 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[20:23], v20 offset:48 -; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 +; VI-DS128-NEXT: v_mov_b32_e32 v12, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v12 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v12 offset:48 +; VI-DS128-NEXT: ds_read_b128 v[8:11], v12 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[12:15], v12 offset:32 +; VI-DS128-NEXT: v_mov_b32_e32 v24, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v23 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v22 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v21 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v4 +; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:96 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v13 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v12 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:64 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v15 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v14 +; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v14 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v2 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19 -; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; VI-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v17 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:96 -; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:112 -; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:64 -; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:80 -; VI-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:32 -; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:48 -; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] -; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:16 +; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:80 +; VI-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:32 +; VI-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:48 +; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] +; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v32i16_to_v32i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v20, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v20 -; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v20 offset:16 -; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v20 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v20 offset:48 -; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v12 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v12 offset:48 +; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v12 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[12:15], v12 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v23 -; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v23 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v22 -; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v22 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v21 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v4 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:96 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v9 +; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v9 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v8 +; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:64 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v11 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v2 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19 -; GFX9-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v18 -; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v17 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:96 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:112 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:64 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:80 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] -; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] +; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = zext <32 x i16> %load to <32 x i32> @@ -2554,54 +2555,54 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out ; SI-NEXT: v_mov_b32_e32 v12, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 -; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 -; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 -; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7 +; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:6 offset1:7 +; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5 +; SI-NEXT: v_mov_b32_e32 v27, s0 ; SI-NEXT: s_waitcnt lgkmcnt(3) -; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v1 -; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v0 -; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v3 -; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v2 -; SI-NEXT: v_bfe_i32 v16, v1, 0, 16 -; SI-NEXT: v_bfe_i32 v18, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v20, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v22, v2, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v1 ; SI-NEXT: s_waitcnt lgkmcnt(2) -; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v5 -; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v4 -; SI-NEXT: v_bfe_i32 v0, v5, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v4, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v7 -; SI-NEXT: v_bfe_i32 v4, v7, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v6 -; SI-NEXT: v_bfe_i32 v6, v6, 0, 16 -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v9 -; SI-NEXT: v_bfe_i32 v24, v9, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8 -; SI-NEXT: v_bfe_i32 v8, v8, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v11 -; SI-NEXT: v_bfe_i32 v26, v11, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10 -; SI-NEXT: v_bfe_i32 v10, v10, 0, 16 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v13 -; SI-NEXT: v_bfe_i32 v28, v13, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12 -; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v15 -; SI-NEXT: v_bfe_i32 v30, v15, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14 -; SI-NEXT: v_bfe_i32 v14, v14, 0, 16 -; SI-NEXT: v_mov_b32_e32 v32, s0 -; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15 -; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11 -; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9 -; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset1:1 +; SI-NEXT: v_ashrrev_i32_e32 v18, 16, v7 +; SI-NEXT: v_ashrrev_i32_e32 v20, 16, v6 +; SI-NEXT: v_bfe_i32 v17, v7, 0, 16 +; SI-NEXT: v_bfe_i32 v19, v6, 0, 16 +; SI-NEXT: ds_write2_b64 v27, v[19:20], v[17:18] offset0:14 offset1:15 +; SI-NEXT: v_ashrrev_i32_e32 v6, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v18, 16, v3 +; SI-NEXT: v_ashrrev_i32_e32 v20, 16, v5 +; SI-NEXT: v_ashrrev_i32_e32 v22, 16, v4 +; SI-NEXT: v_bfe_i32 v19, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v21, v4, 0, 16 +; SI-NEXT: ds_write2_b64 v27, v[21:22], v[19:20] offset0:12 offset1:13 +; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v2 +; SI-NEXT: s_waitcnt lgkmcnt(3) +; SI-NEXT: v_ashrrev_i32_e32 v20, 16, v9 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_ashrrev_i32_e32 v22, 16, v15 +; SI-NEXT: v_ashrrev_i32_e32 v24, 16, v14 +; SI-NEXT: v_bfe_i32 v21, v15, 0, 16 +; SI-NEXT: v_bfe_i32 v23, v14, 0, 16 +; SI-NEXT: ds_write2_b64 v27, v[23:24], v[21:22] offset0:10 offset1:11 +; SI-NEXT: v_ashrrev_i32_e32 v14, 16, v8 +; SI-NEXT: v_ashrrev_i32_e32 v22, 16, v11 +; SI-NEXT: v_ashrrev_i32_e32 v24, 16, v13 +; SI-NEXT: v_ashrrev_i32_e32 v26, 16, v12 +; SI-NEXT: v_bfe_i32 v23, v13, 0, 16 +; SI-NEXT: v_bfe_i32 v25, v12, 0, 16 +; SI-NEXT: ds_write2_b64 v27, v[25:26], v[23:24] offset0:8 offset1:9 +; SI-NEXT: v_ashrrev_i32_e32 v12, 16, v10 +; SI-NEXT: v_bfe_i32 v15, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v5, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v17, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v19, v9, 0, 16 +; SI-NEXT: v_bfe_i32 v13, v8, 0, 16 +; SI-NEXT: v_bfe_i32 v21, v11, 0, 16 +; SI-NEXT: v_bfe_i32 v11, v10, 0, 16 +; SI-NEXT: ds_write2_b64 v27, v[11:12], v[21:22] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v27, v[13:14], v[19:20] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v27, v[3:4], v[17:18] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v27, v[5:6], v[15:16] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32: @@ -2609,112 +2610,111 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v3 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v2 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v1 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v7 -; VI-NO-DS128-NEXT: v_bfe_i32 v8, v3, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v10, v2, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v5 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 -; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: v_bfe_i32 v20, v5, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v4 -; VI-NO-DS128-NEXT: v_bfe_i32 v22, v4, 0, 16 -; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v1 -; VI-NO-DS128-NEXT: v_bfe_i32 v24, v1, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v0 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v5 -; VI-NO-DS128-NEXT: v_bfe_i32 v30, v5, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v4 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v3 -; VI-NO-DS128-NEXT: v_bfe_i32 v26, v3, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v2 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v7 -; VI-NO-DS128-NEXT: v_bfe_i32 v28, v7, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v6 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 -; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v12 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[8:11], v12 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v27, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 16, v3 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 16, v5 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 16, v4 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v5, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v4, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v27, v[19:20], v[17:18] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 16, v7 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 16, v6 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v7, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v21, v6, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v27, v[21:22], v[19:20] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 16, v15 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v14 +; VI-NO-DS128-NEXT: v_bfe_i32 v21, v15, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v23, v14, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 16, v11 +; VI-NO-DS128-NEXT: ds_write2_b64 v27, v[23:24], v[21:22] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 16, v9 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v13 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v12 +; VI-NO-DS128-NEXT: v_bfe_i32 v23, v13, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v25, v12, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v12, 16, v8 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v11, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v21, v9, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v2 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 16, v1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v0 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 16, v10 +; VI-NO-DS128-NEXT: ds_write2_b64 v27, v[25:26], v[23:24] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v3, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v13, v10, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v27, v[11:12], v[21:22] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v27, v[13:14], v[19:20] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v27, v[5:6], v[17:18] offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v27, v[3:4], v[15:16] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v24, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v32, s0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v2 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v1 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v6 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v5 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v5, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v4 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v4, 0, 16 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v1 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v0 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v5 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v30, v5, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v4 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v3 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v2 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v28, v7, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v6 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v12 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[12:15], v12 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v3 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v4, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v28, v[20:21], v[18:19] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v7 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v6 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v6, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v28, v[22:23], v[20:21] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v11 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v10 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v10, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v28, v[24:25], v[22:23] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v9 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v8 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v9, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v8, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 16, v12 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v13, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v12, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v2 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v0 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v15 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 16, v14 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v28, v[26:27], v[24:25] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v3, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v15, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v14, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v28, v[7:8], v[22:23] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v28, v[9:10], v[20:21] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v28, v[5:6], v[18:19] offset1:1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v28, v[3:4], v[16:17] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v32i16_to_v32i32: @@ -2938,112 +2938,111 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v24, s1 -; VI-DS128-NEXT: ds_read_b128 v[0:3], v24 -; VI-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:16 -; VI-DS128-NEXT: ds_read_b128 v[20:23], v24 offset:32 +; VI-DS128-NEXT: v_mov_b32_e32 v12, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v12 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v12 offset:48 +; VI-DS128-NEXT: ds_read_b128 v[8:11], v12 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[12:15], v12 offset:32 +; VI-DS128-NEXT: v_mov_b32_e32 v31, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 16, v3 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v1 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v0 -; VI-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v14, v1, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 -; VI-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; VI-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:48 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v7 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v6 +; VI-DS128-NEXT: v_bfe_i32 v25, v7, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v23, v6, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v31, v[23:26] offset:112 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v23 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v22 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v30, 16, v21 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v28, 16, v20 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v5 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v4 -; VI-DS128-NEXT: v_bfe_i32 v37, v5, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v35, v4, 0, 16 -; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 -; VI-DS128-NEXT: v_bfe_i32 v25, v23, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v23, v22, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v29, v21, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v27, v20, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v7 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v6 -; VI-DS128-NEXT: v_bfe_i32 v33, v7, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v31, v6, 0, 16 -; VI-DS128-NEXT: ds_write_b128 v4, v[35:38] offset:96 -; VI-DS128-NEXT: ds_write_b128 v4, v[31:34] offset:112 -; VI-DS128-NEXT: ds_write_b128 v4, v[27:30] offset:64 -; VI-DS128-NEXT: ds_write_b128 v4, v[23:26] offset:80 -; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 -; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] -; VI-DS128-NEXT: ds_write_b128 v4, v[8:11] offset:16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v13 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v12 +; VI-DS128-NEXT: v_bfe_i32 v25, v13, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v23, v12, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 16, v5 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 16, v4 +; VI-DS128-NEXT: v_bfe_i32 v21, v5, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v19, v4, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v31, v[23:26] offset:64 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v9 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v30, 16, v15 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v28, 16, v14 +; VI-DS128-NEXT: v_bfe_i32 v29, v15, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v27, v14, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v8 +; VI-DS128-NEXT: v_bfe_i32 v25, v9, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v23, v8, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v31, v[19:22] offset:96 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 16, v2 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 16, v1 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 16, v0 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v11 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v10 +; VI-DS128-NEXT: ds_write_b128 v31, v[27:30] offset:80 +; VI-DS128-NEXT: v_bfe_i32 v17, v3, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v21, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v5, v11, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v3, v10, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v31, v[23:26] offset:32 +; VI-DS128-NEXT: ds_write_b128 v31, v[3:6] offset:48 +; VI-DS128-NEXT: ds_write_b128 v31, v[19:22] +; VI-DS128-NEXT: ds_write_b128 v31, v[15:18] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v24 -; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:16 -; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v24 offset:32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v12 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v12 offset:48 +; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v12 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[12:15], v12 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v3 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v1 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v0 -; GFX9-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4 -; GFX9-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:48 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v23 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v22 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v30, 16, v21 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v28, 16, v20 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v5 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v4 -; GFX9-DS128-NEXT: v_bfe_i32 v37, v5, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v35, v4, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-DS128-NEXT: v_bfe_i32 v25, v23, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v29, v21, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v27, v20, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v7 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v6 -; GFX9-DS128-NEXT: v_bfe_i32 v33, v7, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v31, v6, 0, 16 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[35:38] offset:96 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[31:34] offset:112 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[27:30] offset:64 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[23:26] offset:80 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] -; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11] offset:16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v7 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v26, v7, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v24, v6, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v9 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v8 +; GFX9-DS128-NEXT: v_bfe_i32 v26, v9, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v24, v8, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v5 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v4 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v5, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v20, v4, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:64 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v13 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v11 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v10 +; GFX9-DS128-NEXT: v_bfe_i32 v30, v11, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v28, v10, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v12 +; GFX9-DS128-NEXT: v_bfe_i32 v26, v13, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v24, v12, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:96 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v2 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v1 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v15 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v14 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:80 +; GFX9-DS128-NEXT: v_bfe_i32 v18, v3, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v1, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v20, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v5, v15, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v3, v14, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[3:6] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] +; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = sext <32 x i16> %load to <32 x i32> @@ -3054,349 +3053,301 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_zextload_v64i16_to_v64i32: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s15, 0xe8f000 -; SI-NEXT: s_add_u32 s12, s12, s11 -; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v24, s1 +; SI-NEXT: v_mov_b32_e32 v29, s1 ; SI-NEXT: s_mov_b32 m0, -1 -; SI-NEXT: ds_read2_b64 v[0:3], v24 offset0:8 offset1:9 -; SI-NEXT: ds_read2_b64 v[4:7], v24 offset0:10 offset1:11 -; SI-NEXT: ds_read2_b64 v[12:15], v24 offset0:12 offset1:13 -; SI-NEXT: ds_read2_b64 v[8:11], v24 offset0:14 offset1:15 -; SI-NEXT: ds_read2_b64 v[20:23], v24 offset1:1 -; SI-NEXT: ds_read2_b64 v[16:19], v24 offset0:2 offset1:3 -; SI-NEXT: ds_read2_b64 v[34:37], v24 offset0:4 offset1:5 -; SI-NEXT: ds_read2_b64 v[38:41], v24 offset0:6 offset1:7 -; SI-NEXT: s_waitcnt lgkmcnt(7) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2 -; SI-NEXT: s_waitcnt lgkmcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v1 -; SI-NEXT: buffer_store_dword v24, off, s[12:15], 0 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v0 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v3 -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; SI-NEXT: ds_read2_b64 v[9:12], v29 offset0:6 offset1:7 +; SI-NEXT: ds_read2_b64 v[13:16], v29 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[4:7], v29 offset0:8 offset1:9 +; SI-NEXT: ds_read2_b64 v[0:3], v29 offset0:10 offset1:11 +; SI-NEXT: v_mov_b32_e32 v8, s0 +; SI-NEXT: ds_read2_b64 v[17:20], v29 offset0:12 offset1:13 +; SI-NEXT: ds_read2_b64 v[21:24], v29 offset0:14 offset1:15 ; SI-NEXT: s_waitcnt lgkmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 -; SI-NEXT: v_and_b32_e32 v42, 0xffff, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt lgkmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 -; SI-NEXT: v_and_b32_e32 v44, 0xffff, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v11 -; SI-NEXT: v_and_b32_e32 v46, 0xffff, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v35, 0xffff, v11 +; SI-NEXT: ds_read2_b64 v[25:28], v29 offset1:1 +; SI-NEXT: ds_read2_b64 v[29:32], v29 offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v8, v[35:36], v[33:34] offset0:14 offset1:15 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v9 +; SI-NEXT: ds_write2_b64 v8, v[33:34], v[10:11] offset0:12 offset1:13 +; SI-NEXT: s_waitcnt lgkmcnt(8) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:10 offset1:11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v13 ; SI-NEXT: s_waitcnt lgkmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; SI-NEXT: v_and_b32_e32 v48, 0xffff, v21 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v32 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; SI-NEXT: v_and_b32_e32 v33, 0xffff, v30 +; SI-NEXT: ds_write2_b64 v8, v[9:10], v[14:15] offset0:8 offset1:9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v29 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v29 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v28 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: ds_write2_b64 v8, v[31:32], v[11:12] offset0:6 offset1:7 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v23 +; SI-NEXT: ds_write2_b64 v8, v[9:10], v[33:34] offset0:4 offset1:5 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_and_b32_e32 v32, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v34, 0xffff, v21 +; SI-NEXT: ds_write2_b64 v8, v[15:16], v[13:14] offset0:2 offset1:3 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; SI-NEXT: v_and_b32_e32 v50, 0xffff, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt lgkmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17 -; SI-NEXT: v_and_b32_e32 v52, 0xffff, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v19 -; SI-NEXT: v_and_b32_e32 v54, 0xffff, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v35 -; SI-NEXT: v_and_b32_e32 v56, 0xffff, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 -; SI-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v37 -; SI-NEXT: v_and_b32_e32 v58, 0xffff, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v39 -; SI-NEXT: v_and_b32_e32 v60, 0xffff, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38 -; SI-NEXT: v_and_b32_e32 v38, 0xffff, v38 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41 -; SI-NEXT: v_and_b32_e32 v62, 0xffff, v41 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v40 -; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15 -; SI-NEXT: ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11 -; SI-NEXT: ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9 -; SI-NEXT: ds_write2_b64 v0, v[18:19], v[54:55] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v0, v[16:17], v[52:53] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v0, v[22:23], v[50:51] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v0, v[20:21], v[48:49] offset1:1 -; SI-NEXT: ds_write2_b64 v0, v[10:11], v[46:47] offset0:30 offset1:31 -; SI-NEXT: ds_write2_b64 v0, v[8:9], v[44:45] offset0:28 offset1:29 -; SI-NEXT: ds_write2_b64 v0, v[14:15], v[42:43] offset0:26 offset1:27 -; SI-NEXT: ds_write2_b64 v0, v[12:13], v[6:7] offset0:24 offset1:25 -; SI-NEXT: ds_write2_b64 v0, v[4:5], v[2:3] offset0:22 offset1:23 -; SI-NEXT: ds_write2_b64 v0, v[24:25], v[32:33] offset0:20 offset1:21 -; SI-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:18 offset1:19 -; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: ds_write2_b64 v0, v[26:27], v[1:2] offset0:16 offset1:17 +; SI-NEXT: v_and_b32_e32 v36, 0xffff, v19 +; SI-NEXT: ds_write2_b64 v8, v[11:12], v[26:27] offset1:1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; SI-NEXT: v_and_b32_e32 v38, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v40, 0xffff, v17 +; SI-NEXT: ds_write2_b64 v8, v[30:31], v[28:29] offset0:30 offset1:31 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; SI-NEXT: ds_write2_b64 v8, v[34:35], v[32:33] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v8, v[36:37], v[20:21] offset0:26 offset1:27 +; SI-NEXT: ds_write2_b64 v8, v[40:41], v[38:39] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v8, v[16:17], v[26:27] offset0:22 offset1:23 +; SI-NEXT: ds_write2_b64 v8, v[18:19], v[14:15] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v8, v[12:13], v[22:23] offset0:18 offset1:19 +; SI-NEXT: ds_write2_b64 v8, v[9:10], v[24:25] offset0:16 offset1:17 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 -; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 -; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18 -; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17 -; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19 -; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19 -; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:6 offset1:7 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v22 -; VI-NO-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v22 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v21 -; VI-NO-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v21 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v30, 16, v24 -; VI-NO-DS128-NEXT: v_and_b32_e32 v29, 0xffff, v24 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NO-DS128-NEXT: v_and_b32_e32 v31, 0xffff, v23 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; VI-NO-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v18 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v36, 16, v17 -; VI-NO-DS128-NEXT: v_and_b32_e32 v35, 0xffff, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v20 -; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_and_b32_e32 v37, 0xffff, v20 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v19 -; VI-NO-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v19 -; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:10 offset1:11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; VI-NO-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v22 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v21 -; VI-NO-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v21 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v24 -; VI-NO-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v24 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v23 -; VI-NO-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v23 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; VI-NO-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v18 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v17 -; VI-NO-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v17 -; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19 -; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 -; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21 -; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24 -; VI-NO-DS128-NEXT: v_and_b32_e32 v59, 0xffff, v24 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v24, 16, v23 -; VI-NO-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v62, 16, v17 -; VI-NO-DS128-NEXT: v_and_b32_e32 v61, 0xffff, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[0:1], v[19:20] offset0:30 offset1:31 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[16:17], v[61:62] offset0:28 offset1:29 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[23:24], v[59:60] offset0:26 offset1:27 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[21:22], v[57:58] offset0:24 offset1:25 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[55:56], v[53:54] offset0:22 offset1:23 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[51:52], v[49:50] offset0:20 offset1:21 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[47:48], v[45:46] offset0:18 offset1:19 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[43:44], v[41:42] offset0:16 offset1:17 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[39:40], v[37:38] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[35:36], v[33:34] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[31:32], v[29:30] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[27:28], v[25:26] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[14:15], v[12:13] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[10:11], v[8:9] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[6:7], v[4:5] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload -; VI-NO-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload -; VI-NO-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[2:3], v[0:1] offset1:1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v29, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[9:12], v29 offset0:14 offset1:15 +; VI-NO-DS128-NEXT: ds_read2_b64 v[13:16], v29 offset0:12 offset1:13 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v29 offset1:1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v29 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v29 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v29 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[25:28], v29 offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v29 offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; VI-NO-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; VI-NO-DS128-NEXT: v_and_b32_e32 v35, 0xffff, v11 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[35:36], v[33:34] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; VI-NO-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v9 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[33:34], v[10:11] offset0:28 offset1:29 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v15 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:26 offset1:27 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v14 +; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:24 offset1:25 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(4) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v32 +; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v31 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v31 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:22 offset1:23 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v30 +; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v30 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v29 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v29 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:20 offset1:21 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v28 +; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v28 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v27 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v27 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:18 offset1:19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v26 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v25 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v25 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; VI-NO-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-NO-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; VI-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v21 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:16 offset1:17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v24 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v24 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[22:23], v[15:16] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; VI-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v20 +; VI-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v19 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[24:25], v[22:23] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; VI-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; VI-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[26:27], v[24:25] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; VI-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v3 +; VI-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v2 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[13:14], v[11:12] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v5 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; VI-NO-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; VI-NO-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v6 +; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v1 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v0 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[24:25], v[22:23] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[18:19], v[20:21] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[15:16], v[13:14] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NO-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NO-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NO-DS128-NEXT: s_mov_b32 s14, -1 -; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v14 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v17 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v15 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v14 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v17 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v16 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v18 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v18 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v20 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v23 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v23 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: s_nop 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v44, 0xffff, v19 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v18 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v46, 0xffff, v18 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v21 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v50, 0xffff, v20 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v53, 16, v23 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v52, 0xffff, v23 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v19 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v21 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v21 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[22:23], v[62:63] offset0:30 offset1:31 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[20:21], v[60:61] offset0:28 offset1:29 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[18:19], v[58:59] offset0:26 offset1:27 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[16:17], v[56:57] offset0:24 offset1:25 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[54:55], v[52:53] offset0:22 offset1:23 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[50:51], v[48:49] offset0:20 offset1:21 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[46:47], v[44:45] offset0:18 offset1:19 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[42:43], v[40:41] offset0:16 offset1:17 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[24:25], v[38:39] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[36:37], v[34:35] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[32:33], v[30:31] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[28:29], v[26:27] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload -; GFX9-NO-DS128-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GFX9-NO-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v29, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[9:12], v29 offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[13:16], v29 offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v29 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v29 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[17:20], v29 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[21:24], v29 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[25:28], v29 offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v29 offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v12 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v35, 0xffff, v11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[35:36], v[33:34] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[33:34], v[10:11] offset0:28 offset1:29 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:26 offset1:27 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v14 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v13 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v32 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v32 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v31 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v31 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:22 offset1:23 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v30 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v30 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v29 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v29 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:20 offset1:21 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v28 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v28 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v27 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v27 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:18 offset1:19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v26 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v25 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v25 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v23 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v22 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v21 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset0:16 offset1:17 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v24 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v24 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[22:23], v[15:16] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v20 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v19 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[24:25], v[22:23] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v17 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[26:27], v[24:25] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v3 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v2 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[13:14], v[11:12] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v5 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v4 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v6 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v1 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v0 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[24:25], v[22:23] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[18:19], v[20:21] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[15:16], v[13:14] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[11:12], v[9:10] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v64i16_to_v64i32: @@ -3795,278 +3746,199 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 -; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; VI-DS128-NEXT: s_mov_b32 s90, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 -; VI-DS128-NEXT: ds_read_b128 v[8:11], v0 -; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 -; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 -; VI-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; VI-DS128-NEXT: v_mov_b32_e32 v33, s1 +; VI-DS128-NEXT: ds_read_b128 v[5:8], v33 offset:112 +; VI-DS128-NEXT: ds_read_b128 v[9:12], v33 offset:96 +; VI-DS128-NEXT: ds_read_b128 v[13:16], v33 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v33 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[21:24], v33 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[25:28], v33 offset:48 +; VI-DS128-NEXT: ds_read_b128 v[29:32], v33 offset:64 +; VI-DS128-NEXT: ds_read_b128 v[33:36], v33 offset:80 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(7) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; VI-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v6 +; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v5 +; VI-DS128-NEXT: ds_write_b128 v4, v[17:20] offset:224 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; VI-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(7) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v10 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; VI-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:192 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; VI-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; VI-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:208 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; VI-DS128-NEXT: v_mov_b32_e32 v4, v3 -; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 -; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v9 -; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 -; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v23 -; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 -; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 -; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 -; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 -; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 -; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 -; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 -; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 -; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill -; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v25 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v24 -; VI-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v27 -; VI-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v26 -; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 -; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 -; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 -; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 -; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 -; VI-DS128-NEXT: v_mov_b32_e32 v24, s0 -; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 -; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v56 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v55 -; VI-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v58 -; VI-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v57 -; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v56 -; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v55 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v27 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v26 -; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v27 -; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v26 -; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:224 -; VI-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:240 -; VI-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:192 -; VI-DS128-NEXT: ds_write_b128 v24, v[58:61] offset:208 -; VI-DS128-NEXT: ds_write_b128 v24, v[51:54] offset:160 -; VI-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176 -; VI-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128 -; VI-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144 -; VI-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96 -; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 -; VI-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:64 -; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:80 -; VI-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:32 -; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:36 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:40 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:44 ; 4-byte Folded Reload -; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 -; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload -; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] -; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload -; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v34 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v34 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v33 +; VI-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; VI-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:160 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v36 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v36 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v35 +; VI-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v35 +; VI-DS128-NEXT: ds_write_b128 v4, v[17:20] offset:240 +; VI-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:176 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v30 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v29 +; VI-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v29 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v26 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; VI-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v26 +; VI-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v25 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v28 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v27 +; VI-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v28 +; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; VI-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; VI-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v21 +; VI-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:128 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v32 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v31 +; VI-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v31 +; VI-DS128-NEXT: ds_write_b128 v4, v[25:28] offset:64 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v24 +; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v23 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; VI-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v1 +; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v0 +; VI-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:144 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v16 +; VI-DS128-NEXT: ds_write_b128 v4, v[9:12] offset:96 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v14 +; VI-DS128-NEXT: ds_write_b128 v4, v[17:20] offset:112 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; VI-DS128-NEXT: ds_write_b128 v4, v[26:29] offset:80 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; VI-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; VI-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v14 +; VI-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v13 +; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; VI-DS128-NEXT: ds_write_b128 v4, v[22:25] offset:32 +; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:48 +; VI-DS128-NEXT: ds_write_b128 v4, v[9:12] +; VI-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v64i16_to_v64i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DS128-NEXT: s_mov_b32 s14, -1 -; GFX9-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0 -; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 -; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 -; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; GFX9-DS128-NEXT: v_mov_b32_e32 v33, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[5:8], v33 offset:112 +; GFX9-DS128-NEXT: ds_read_b128 v[9:12], v33 offset:96 +; GFX9-DS128-NEXT: ds_read_b128 v[13:16], v33 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v33 offset:16 +; GFX9-DS128-NEXT: ds_read_b128 v[21:24], v33 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[25:28], v33 offset:48 +; GFX9-DS128-NEXT: ds_read_b128 v[29:32], v33 offset:64 +; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v33 offset:80 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX9-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v6 +; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v5 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[17:20] offset:224 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; GFX9-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v10 +; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v10 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:192 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v12 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; GFX9-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:208 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 -; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 -; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v9 -; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 -; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v20 -; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v23 -; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 -; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 -; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 -; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 -; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 -; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 -; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 -; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v25 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v24 -; GFX9-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v27 -; GFX9-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v26 -; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 -; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 -; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 -; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 -; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 -; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 -; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 -; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v56 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v55 -; GFX9-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v58 -; GFX9-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v57 -; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v56 -; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v55 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v27 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v26 -; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v27 -; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v26 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:224 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:240 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:192 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[58:61] offset:208 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[51:54] offset:160 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:64 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:80 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:32 -; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 -; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] -; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v34 +; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v34 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v33 +; GFX9-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v33 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:160 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v36 +; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v36 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v35 +; GFX9-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v35 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[17:20] offset:240 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:176 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v30 +; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v30 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v29 +; GFX9-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v29 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v26 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v25 +; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v26 +; GFX9-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v25 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v28 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v27 +; GFX9-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v28 +; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v27 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v21 +; GFX9-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v22 +; GFX9-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v21 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:128 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v32 +; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v32 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v31 +; GFX9-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v31 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[25:28] offset:64 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v1 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v23 +; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v24 +; GFX9-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v23 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v1 +; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v0 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:144 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v16 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[9:12] offset:96 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v14 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[17:20] offset:112 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[26:29] offset:80 +; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v16 +; GFX9-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v14 +; GFX9-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v13 +; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[22:25] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v4, v[9:12] +; GFX9-DS128-NEXT: ds_write_b128 v4, v[5:8] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(3) %in %ext = zext <64 x i16> %load to <64 x i32> @@ -4077,343 +3949,311 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 { ; SI-LABEL: local_sextload_v64i16_to_v64i32: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s15, 0xe8f000 -; SI-NEXT: s_add_u32 s12, s12, s11 -; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, s1 +; SI-NEXT: v_mov_b32_e32 v16, s1 ; SI-NEXT: s_mov_b32 m0, -1 -; SI-NEXT: ds_read2_b64 v[4:7], v20 offset0:8 offset1:9 -; SI-NEXT: ds_read2_b64 v[0:3], v20 offset0:10 offset1:11 -; SI-NEXT: ds_read2_b64 v[8:11], v20 offset0:12 offset1:13 -; SI-NEXT: ds_read2_b64 v[12:15], v20 offset0:14 offset1:15 -; SI-NEXT: ds_read2_b64 v[16:19], v20 offset1:1 -; SI-NEXT: ds_read2_b64 v[30:33], v20 offset0:2 offset1:3 -; SI-NEXT: ds_read2_b64 v[34:37], v20 offset0:4 offset1:5 -; SI-NEXT: ds_read2_b64 v[38:41], v20 offset0:6 offset1:7 -; SI-NEXT: s_waitcnt lgkmcnt(7) -; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v5 -; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v4 -; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v7 -; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v6 -; SI-NEXT: s_waitcnt lgkmcnt(6) -; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v1 -; SI-NEXT: v_bfe_i32 v20, v5, 0, 16 -; SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; SI-NEXT: v_bfe_i32 v22, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v24, v7, 0, 16 -; SI-NEXT: v_bfe_i32 v26, v6, 0, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v0 -; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v3 -; SI-NEXT: v_bfe_i32 v28, v1, 0, 16 -; SI-NEXT: v_bfe_i32 v20, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v6, v3, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v2 -; SI-NEXT: v_bfe_i32 v4, v2, 0, 16 +; SI-NEXT: ds_read2_b64 v[21:24], v16 offset0:6 offset1:7 +; SI-NEXT: ds_read2_b64 v[12:15], v16 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 +; SI-NEXT: v_mov_b32_e32 v20, s0 +; SI-NEXT: ds_read2_b64 v[25:28], v16 offset0:14 offset1:15 +; SI-NEXT: ds_read2_b64 v[29:32], v16 offset0:12 offset1:13 ; SI-NEXT: s_waitcnt lgkmcnt(5) -; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v9 -; SI-NEXT: v_bfe_i32 v2, v9, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8 -; SI-NEXT: v_bfe_i32 v8, v8, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v43, 16, v11 -; SI-NEXT: v_bfe_i32 v42, v11, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10 -; SI-NEXT: v_bfe_i32 v10, v10, 0, 16 -; SI-NEXT: s_waitcnt lgkmcnt(4) -; SI-NEXT: v_ashrrev_i32_e32 v45, 16, v13 -; SI-NEXT: v_bfe_i32 v44, v13, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12 -; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v47, 16, v15 -; SI-NEXT: v_bfe_i32 v46, v15, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v34, 16, v24 +; SI-NEXT: v_ashrrev_i32_e32 v36, 16, v23 +; SI-NEXT: v_bfe_i32 v33, v24, 0, 16 +; SI-NEXT: v_bfe_i32 v35, v23, 0, 16 +; SI-NEXT: ds_read2_b64 v[8:11], v16 offset0:10 offset1:11 +; SI-NEXT: ds_read2_b64 v[16:19], v16 offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v20, v[35:36], v[33:34] offset0:14 offset1:15 +; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v22 +; SI-NEXT: v_bfe_i32 v22, v22, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v34, 16, v21 +; SI-NEXT: v_bfe_i32 v33, v21, 0, 16 +; SI-NEXT: ds_write2_b64 v20, v[33:34], v[22:23] offset0:12 offset1:13 +; SI-NEXT: s_waitcnt lgkmcnt(8) +; SI-NEXT: v_ashrrev_i32_e32 v22, 16, v15 +; SI-NEXT: v_bfe_i32 v21, v15, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14 ; SI-NEXT: v_bfe_i32 v14, v14, 0, 16 -; SI-NEXT: s_waitcnt lgkmcnt(3) -; SI-NEXT: v_ashrrev_i32_e32 v49, 16, v17 -; SI-NEXT: v_bfe_i32 v48, v17, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v16 -; SI-NEXT: v_bfe_i32 v16, v16, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v51, 16, v19 -; SI-NEXT: v_bfe_i32 v50, v19, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v18 -; SI-NEXT: v_bfe_i32 v18, v18, 0, 16 -; SI-NEXT: s_waitcnt lgkmcnt(2) -; SI-NEXT: v_ashrrev_i32_e32 v53, 16, v31 -; SI-NEXT: v_bfe_i32 v52, v31, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v30 -; SI-NEXT: v_bfe_i32 v30, v30, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v55, 16, v33 -; SI-NEXT: v_bfe_i32 v54, v33, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v33, 16, v32 -; SI-NEXT: v_bfe_i32 v32, v32, 0, 16 -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_ashrrev_i32_e32 v57, 16, v35 -; SI-NEXT: v_bfe_i32 v56, v35, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v35, 16, v34 -; SI-NEXT: v_bfe_i32 v34, v34, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v59, 16, v37 -; SI-NEXT: v_bfe_i32 v58, v37, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v37, 16, v36 -; SI-NEXT: v_bfe_i32 v36, v36, 0, 16 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_ashrrev_i32_e32 v61, 16, v39 -; SI-NEXT: v_bfe_i32 v60, v39, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v39, 16, v38 -; SI-NEXT: v_bfe_i32 v38, v38, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v63, 16, v41 -; SI-NEXT: v_bfe_i32 v62, v41, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v41, 16, v40 -; SI-NEXT: v_bfe_i32 v40, v40, 0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15 -; SI-NEXT: ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11 -; SI-NEXT: ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9 -; SI-NEXT: ds_write2_b64 v0, v[32:33], v[54:55] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v0, v[30:31], v[52:53] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v0, v[18:19], v[50:51] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v0, v[16:17], v[48:49] offset1:1 -; SI-NEXT: ds_write2_b64 v0, v[14:15], v[46:47] offset0:30 offset1:31 -; SI-NEXT: ds_write2_b64 v0, v[12:13], v[44:45] offset0:28 offset1:29 -; SI-NEXT: ds_write2_b64 v0, v[10:11], v[42:43] offset0:26 offset1:27 -; SI-NEXT: ds_write2_b64 v0, v[8:9], v[2:3] offset0:24 offset1:25 -; SI-NEXT: ds_write2_b64 v0, v[4:5], v[6:7] offset0:22 offset1:23 -; SI-NEXT: ds_write2_b64 v0, v[20:21], v[28:29] offset0:20 offset1:21 -; SI-NEXT: ds_write2_b64 v0, v[26:27], v[24:25] offset0:18 offset1:19 -; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: ds_write2_b64 v0, v[22:23], v[1:2] offset0:16 offset1:17 +; SI-NEXT: v_ashrrev_i32_e32 v24, 16, v13 +; SI-NEXT: v_bfe_i32 v23, v13, 0, 16 +; SI-NEXT: ds_write2_b64 v20, v[14:15], v[21:22] offset0:10 offset1:11 +; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12 +; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(8) +; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v7 +; SI-NEXT: v_bfe_i32 v14, v7, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v6 +; SI-NEXT: v_bfe_i32 v6, v6, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v22, 16, v5 +; SI-NEXT: v_bfe_i32 v21, v5, 0, 16 +; SI-NEXT: ds_write2_b64 v20, v[12:13], v[23:24] offset0:8 offset1:9 +; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v4 +; SI-NEXT: v_bfe_i32 v4, v4, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(8) +; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v3 +; SI-NEXT: v_bfe_i32 v12, v3, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v2 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v24, 16, v1 +; SI-NEXT: v_bfe_i32 v23, v1, 0, 16 +; SI-NEXT: ds_write2_b64 v20, v[6:7], v[14:15] offset0:6 offset1:7 +; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: s_waitcnt lgkmcnt(5) +; SI-NEXT: v_ashrrev_i32_e32 v6, 16, v17 +; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v28 +; SI-NEXT: v_ashrrev_i32_e32 v34, 16, v27 +; SI-NEXT: v_bfe_i32 v14, v28, 0, 16 +; SI-NEXT: v_bfe_i32 v33, v27, 0, 16 +; SI-NEXT: ds_write2_b64 v20, v[4:5], v[21:22] offset0:4 offset1:5 +; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v16 +; SI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 +; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v26 +; SI-NEXT: v_ashrrev_i32_e32 v36, 16, v25 +; SI-NEXT: v_bfe_i32 v26, v26, 0, 16 +; SI-NEXT: v_bfe_i32 v35, v25, 0, 16 +; SI-NEXT: ds_write2_b64 v20, v[2:3], v[12:13] offset0:2 offset1:3 +; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v18 +; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v9 +; SI-NEXT: v_ashrrev_i32_e32 v38, 16, v32 +; SI-NEXT: v_ashrrev_i32_e32 v40, 16, v31 +; SI-NEXT: v_bfe_i32 v37, v32, 0, 16 +; SI-NEXT: v_bfe_i32 v39, v31, 0, 16 +; SI-NEXT: ds_write2_b64 v20, v[0:1], v[23:24] offset1:1 +; SI-NEXT: v_ashrrev_i32_e32 v24, 16, v8 +; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v11 +; SI-NEXT: v_ashrrev_i32_e32 v42, 16, v30 +; SI-NEXT: v_ashrrev_i32_e32 v44, 16, v29 +; SI-NEXT: v_bfe_i32 v41, v30, 0, 16 +; SI-NEXT: v_bfe_i32 v43, v29, 0, 16 +; SI-NEXT: ds_write2_b64 v20, v[33:34], v[14:15] offset0:30 offset1:31 +; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v10 +; SI-NEXT: v_bfe_i32 v5, v17, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v16, 0, 16 +; SI-NEXT: v_bfe_i32 v21, v19, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v18, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v9, 0, 16 +; SI-NEXT: v_bfe_i32 v23, v8, 0, 16 +; SI-NEXT: v_bfe_i32 v30, v11, 0, 16 +; SI-NEXT: v_bfe_i32 v14, v10, 0, 16 +; SI-NEXT: ds_write2_b64 v20, v[35:36], v[26:27] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v20, v[39:40], v[37:38] offset0:26 offset1:27 +; SI-NEXT: ds_write2_b64 v20, v[43:44], v[41:42] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v20, v[14:15], v[30:31] offset0:22 offset1:23 +; SI-NEXT: ds_write2_b64 v20, v[23:24], v[12:13] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v20, v[1:2], v[21:22] offset0:18 offset1:19 +; SI-NEXT: ds_write2_b64 v20, v[3:4], v[5:6] offset0:16 offset1:17 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 -; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 -; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 -; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 -; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 -; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 -; VI-NO-DS128-NEXT: v_bfe_i32 v26, v29, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v32 -; VI-NO-DS128-NEXT: v_bfe_i32 v37, v32, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 -; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 -; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 -; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 -; VI-NO-DS128-NEXT: v_bfe_i32 v43, v33, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v36 -; VI-NO-DS128-NEXT: v_bfe_i32 v45, v36, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v35 -; VI-NO-DS128-NEXT: v_bfe_i32 v47, v35, 0, 16 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v30 -; VI-NO-DS128-NEXT: v_bfe_i32 v49, v30, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v29 -; VI-NO-DS128-NEXT: v_bfe_i32 v51, v29, 0, 16 -; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 -; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 -; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 -; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 -; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 -; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v17 -; VI-NO-DS128-NEXT: v_bfe_i32 v8, v15, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v10, v14, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v12, v17, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v16 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v16, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v21 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v21, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v20 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v20, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v23 -; VI-NO-DS128-NEXT: v_bfe_i32 v20, v23, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v22 -; VI-NO-DS128-NEXT: v_bfe_i32 v22, v22, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v58, 16, v34 -; VI-NO-DS128-NEXT: v_bfe_i32 v57, v34, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v33 -; VI-NO-DS128-NEXT: v_bfe_i32 v33, v33, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v36 -; VI-NO-DS128-NEXT: v_bfe_i32 v59, v36, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v35 -; VI-NO-DS128-NEXT: v_bfe_i32 v35, v35, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v29 -; VI-NO-DS128-NEXT: v_bfe_i32 v61, v29, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v28 -; VI-NO-DS128-NEXT: v_bfe_i32 v28, v28, 0, 16 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload -; VI-NO-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload -; VI-NO-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[2:3], v[0:1] offset1:1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v29, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[1:4], v29 offset0:14 offset1:15 +; VI-NO-DS128-NEXT: ds_read2_b64 v[5:8], v29 offset0:12 offset1:13 +; VI-NO-DS128-NEXT: ds_read2_b64 v[9:12], v29 offset0:10 offset1:11 +; VI-NO-DS128-NEXT: ds_read2_b64 v[13:16], v29 offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v29 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v29 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[25:28], v29 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v29 offset1:1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v4 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v33, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v35, v3, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[35:36], v[33:34] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v2 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v1 +; VI-NO-DS128-NEXT: v_bfe_i32 v33, v1, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[33:34], v[2:3] offset0:28 offset1:29 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v8 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v8, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v7 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v7, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:26 offset1:27 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v6 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v6, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v5 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v5, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:24 offset1:25 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v12 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v12, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v11 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:22 offset1:23 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v10 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v10, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v9 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v9, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:20 offset1:21 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(10) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v16 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v16, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v15 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v15, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:18 offset1:19 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v14 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v14, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v13 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v13, 0, 16 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(10) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 16, v18 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 16, v17 +; VI-NO-DS128-NEXT: v_bfe_i32 v7, v18, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v9, v17, 0, 16 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v12, 16, v24 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 16, v23 +; VI-NO-DS128-NEXT: v_bfe_i32 v11, v24, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v13, v23, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 16, v22 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 16, v21 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v22, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v21, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:16 offset1:17 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v20 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v19 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v20, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v19, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[13:14], v[11:12] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(10) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 16, v28 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[17:18], v[15:16] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 16, v27 +; VI-NO-DS128-NEXT: v_bfe_i32 v13, v28, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v27, 0, 16 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(10) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v30 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v29 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v32 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[9:10], v[7:8] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 16, v31 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 16, v26 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v12, 16, v25 +; VI-NO-DS128-NEXT: v_bfe_i32 v1, v30, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v29, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v32, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v7, v31, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v9, v26, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v11, v25, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[15:16], v[13:14] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[11:12], v[9:10] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NO-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NO-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NO-DS128-NEXT: s_mov_b32 s14, -1 -; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 -; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v29, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v32 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v37, v32, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v43, v33, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v36 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v45, v36, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v35 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v47, v35, 0, 16 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v30 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v49, v30, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v29 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v51, v29, 0, 16 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: s_nop 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v17 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v15, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v14, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v17, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v16, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v21 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v21, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v20 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v20, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v23 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v23, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v22 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v22, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v58, 16, v34 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v57, v34, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v33 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v33, v33, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v36 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v59, v36, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v35 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v35, v35, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v29 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v61, v29, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v28 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v28, v28, 0, 16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GFX9-NO-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GFX9-NO-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[2:3], v[0:1] offset1:1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v29, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[1:4], v29 offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[5:8], v29 offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[9:12], v29 offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[13:16], v29 offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[17:20], v29 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[21:24], v29 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[25:28], v29 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v29 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v4 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v3 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v33, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v35, v3, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[35:36], v[33:34] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v2 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v1 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v33, v1, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[33:34], v[2:3] offset0:28 offset1:29 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v8 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v8, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v7 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v7, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:26 offset1:27 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v6 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v5 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v5, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v12 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v12, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:22 offset1:23 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v10 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v10, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v9 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v9, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:20 offset1:21 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(10) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v16, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v15 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v15, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:18 offset1:19 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v14 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v14, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v13 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v13, 0, 16 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v12, 16, v28 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 16, v27 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v11, v28, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v13, v27, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 16, v26 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v25 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v15, v26, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v25, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:16 offset1:17 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v32 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v31 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v32, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v31, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 16, v30 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 16, v29 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v30, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v29, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[13:14], v[11:12] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 16, v24 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[26:27], v[15:16] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 16, v23 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v13, v24, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v15, v23, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v18 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v17 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v20 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[9:10], v[7:8] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 16, v19 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 16, v22 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v12, 16, v21 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v18, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v17, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v20, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v19, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v22, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v11, v21, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[15:16], v[13:14] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[11:12], v[9:10] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v64i16_to_v64i32: @@ -4845,258 +4685,207 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; VI-DS128-NEXT: s_mov_b32 m0, -1 -; VI-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 -; VI-DS128-NEXT: s_mov_b32 s90, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v32, s1 -; VI-DS128-NEXT: ds_read_b128 v[8:11], v32 -; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 -; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 -; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 -; VI-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 -; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v10, 0, 16 -; VI-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v9 +; VI-DS128-NEXT: v_mov_b32_e32 v33, s1 +; VI-DS128-NEXT: ds_read_b128 v[1:4], v33 offset:112 +; VI-DS128-NEXT: ds_read_b128 v[5:8], v33 offset:96 +; VI-DS128-NEXT: ds_read_b128 v[9:12], v33 offset:80 +; VI-DS128-NEXT: ds_read_b128 v[13:16], v33 offset:64 +; VI-DS128-NEXT: v_mov_b32_e32 v0, s0 +; VI-DS128-NEXT: ds_read_b128 v[21:24], v33 offset:48 +; VI-DS128-NEXT: ds_read_b128 v[25:28], v33 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[29:32], v33 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[33:36], v33 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(7) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 16, v2 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 16, v1 +; VI-DS128-NEXT: v_bfe_i32 v19, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v17, v1, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v0, v[17:20] offset:224 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 16, v4 +; VI-DS128-NEXT: v_bfe_i32 v19, v4, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 16, v3 +; VI-DS128-NEXT: v_bfe_i32 v17, v3, 0, 16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(7) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v6 +; VI-DS128-NEXT: v_bfe_i32 v3, v6, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v5 +; VI-DS128-NEXT: v_bfe_i32 v1, v5, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:192 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 -; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v17 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v16 -; VI-DS128-NEXT: v_bfe_i32 v10, v19, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v8, v18, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v14, v17, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v12, v16, 0, 16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26 -; VI-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36 -; VI-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16 -; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 -; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 -; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 -; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37 -; VI-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 -; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 -; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 -; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 -; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 -; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 -; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v30, v34, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v28, v33, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v36 -; VI-DS128-NEXT: v_bfe_i32 v48, v36, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v43 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v42 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v41 -; VI-DS128-NEXT: v_bfe_i32 v35, v43, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v33, v42, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v54, v41, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v59 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v58 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v57 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v56 -; VI-DS128-NEXT: v_bfe_i32 v61, v59, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v59, v58, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v6, v57, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v4, v56, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v40 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v39 -; VI-DS128-NEXT: v_bfe_i32 v42, v40, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v40, v39, 0, 16 -; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:224 -; VI-DS128-NEXT: ds_write_b128 v32, v[40:43] offset:240 -; VI-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:192 -; VI-DS128-NEXT: ds_write_b128 v32, v[59:62] offset:208 -; VI-DS128-NEXT: ds_write_b128 v32, v[52:55] offset:160 -; VI-DS128-NEXT: ds_write_b128 v32, v[33:36] offset:176 -; VI-DS128-NEXT: ds_write_b128 v32, v[48:51] offset:128 -; VI-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144 -; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96 -; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 -; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:64 -; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80 -; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:32 -; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:48 -; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload -; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] -; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload -; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload -; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v7 +; VI-DS128-NEXT: v_bfe_i32 v1, v7, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:208 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v10 +; VI-DS128-NEXT: v_bfe_i32 v3, v10, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v9 +; VI-DS128-NEXT: v_bfe_i32 v1, v9, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:160 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v12 +; VI-DS128-NEXT: v_bfe_i32 v3, v12, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v11 +; VI-DS128-NEXT: v_bfe_i32 v1, v11, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:176 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(9) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v14 +; VI-DS128-NEXT: v_bfe_i32 v3, v14, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v13 +; VI-DS128-NEXT: v_bfe_i32 v1, v13, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:128 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v16 +; VI-DS128-NEXT: v_bfe_i32 v3, v16, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v15 +; VI-DS128-NEXT: v_bfe_i32 v1, v15, 0, 16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 16, v26 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 16, v25 +; VI-DS128-NEXT: v_bfe_i32 v15, v26, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v13, v25, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v0, v[17:20] offset:240 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v8, 16, v22 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v21 +; VI-DS128-NEXT: v_bfe_i32 v7, v22, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v5, v21, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 16, v24 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 16, v23 +; VI-DS128-NEXT: v_bfe_i32 v11, v24, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v9, v23, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v0, v[13:16] offset:64 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(9) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 16, v30 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 16, v28 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 16, v27 +; VI-DS128-NEXT: v_bfe_i32 v19, v28, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v17, v27, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 16, v29 +; VI-DS128-NEXT: v_bfe_i32 v15, v30, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v13, v29, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:144 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(9) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v36 +; VI-DS128-NEXT: ds_write_b128 v0, v[5:8] offset:96 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v35 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v8, 16, v34 +; VI-DS128-NEXT: ds_write_b128 v0, v[9:12] offset:112 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v33 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 16, v32 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 16, v31 +; VI-DS128-NEXT: ds_write_b128 v0, v[17:20] offset:80 +; VI-DS128-NEXT: v_bfe_i32 v3, v36, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v1, v35, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v7, v34, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v5, v33, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v11, v32, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v9, v31, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v0, v[13:16] offset:32 +; VI-DS128-NEXT: ds_write_b128 v0, v[9:12] offset:48 +; VI-DS128-NEXT: ds_write_b128 v0, v[5:8] +; VI-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DS128-NEXT: s_mov_b32 s14, -1 -; GFX9-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32 -; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 -; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 -; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 -; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GFX9-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v9 +; GFX9-DS128-NEXT: v_mov_b32_e32 v33, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[1:4], v33 offset:112 +; GFX9-DS128-NEXT: ds_read_b128 v[5:8], v33 offset:96 +; GFX9-DS128-NEXT: ds_read_b128 v[9:12], v33 offset:80 +; GFX9-DS128-NEXT: ds_read_b128 v[13:16], v33 offset:64 +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DS128-NEXT: ds_read_b128 v[21:24], v33 +; GFX9-DS128-NEXT: ds_read_b128 v[25:28], v33 offset:16 +; GFX9-DS128-NEXT: ds_read_b128 v[29:32], v33 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v33 offset:48 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 16, v2 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 16, v1 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v2, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v17, v1, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[17:20] offset:224 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 16, v4 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v4, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 16, v3 +; GFX9-DS128-NEXT: v_bfe_i32 v17, v3, 0, 16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v3, v6, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v5 +; GFX9-DS128-NEXT: v_bfe_i32 v1, v5, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:192 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 -; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v17 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v16 -; GFX9-DS128-NEXT: v_bfe_i32 v10, v19, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v8, v18, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v17, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v12, v16, 0, 16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26 -; GFX9-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36 -; GFX9-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16 -; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 -; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 -; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37 -; GFX9-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 -; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 -; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 -; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 -; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 -; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 -; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v30, v34, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v28, v33, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v36 -; GFX9-DS128-NEXT: v_bfe_i32 v48, v36, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v43 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v42 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v41 -; GFX9-DS128-NEXT: v_bfe_i32 v35, v43, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v33, v42, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v54, v41, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v59 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v58 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v57 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v56 -; GFX9-DS128-NEXT: v_bfe_i32 v61, v59, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v59, v58, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v57, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v56, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v40 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v39 -; GFX9-DS128-NEXT: v_bfe_i32 v42, v40, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v40, v39, 0, 16 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:224 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[40:43] offset:240 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:192 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[59:62] offset:208 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[52:55] offset:160 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[33:36] offset:176 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[48:51] offset:128 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:64 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:48 -; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] -; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v7 +; GFX9-DS128-NEXT: v_bfe_i32 v1, v7, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:208 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(8) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v10 +; GFX9-DS128-NEXT: v_bfe_i32 v3, v10, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v9 +; GFX9-DS128-NEXT: v_bfe_i32 v1, v9, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:160 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v12 +; GFX9-DS128-NEXT: v_bfe_i32 v3, v12, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v11 +; GFX9-DS128-NEXT: v_bfe_i32 v1, v11, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:176 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(9) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v14 +; GFX9-DS128-NEXT: v_bfe_i32 v3, v14, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v13 +; GFX9-DS128-NEXT: v_bfe_i32 v1, v13, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:128 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v16 +; GFX9-DS128-NEXT: v_bfe_i32 v3, v16, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v15 +; GFX9-DS128-NEXT: v_bfe_i32 v1, v15, 0, 16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 16, v30 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 16, v29 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v30, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v29, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[17:20] offset:240 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v8, 16, v34 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v33 +; GFX9-DS128-NEXT: v_bfe_i32 v7, v34, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v5, v33, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 16, v36 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v10, 16, v35 +; GFX9-DS128-NEXT: v_bfe_i32 v11, v36, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v9, v35, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[13:16] offset:64 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 16, v26 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 16, v32 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 16, v31 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v32, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v17, v31, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 16, v25 +; GFX9-DS128-NEXT: v_bfe_i32 v15, v26, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v25, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:144 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v24 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[5:8] offset:96 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v2, 16, v23 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v8, 16, v22 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[9:12] offset:112 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v21 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 16, v28 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v10, 16, v27 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[17:20] offset:80 +; GFX9-DS128-NEXT: v_bfe_i32 v3, v24, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v1, v23, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v7, v22, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v5, v21, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v11, v28, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v9, v27, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[13:16] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[9:12] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v0, v[5:8] +; GFX9-DS128-NEXT: ds_write_b128 v0, v[1:4] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(3) %in %ext = sext <64 x i16> %load to <64 x i32> @@ -5905,6 +5694,7 @@ define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; SI-NEXT: v_mov_b32_e32 v15, v5 ; SI-NEXT: v_mov_b32_e32 v17, v5 ; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v20, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 @@ -5914,11 +5704,10 @@ define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v1 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: ds_write2_b64 v0, v[8:9], v[6:7] offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v0, v[12:13], v[4:5] offset0:2 offset1:3 -; SI-NEXT: ds_write2_b64 v0, v[10:11], v[16:17] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v0, v[14:15], v[18:19] offset1:1 +; SI-NEXT: ds_write2_b64 v20, v[8:9], v[6:7] offset0:6 offset1:7 +; SI-NEXT: ds_write2_b64 v20, v[12:13], v[4:5] offset0:2 offset1:3 +; SI-NEXT: ds_write2_b64 v20, v[10:11], v[16:17] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v20, v[14:15], v[18:19] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v8i16_to_v8i64: @@ -6403,46 +6192,50 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: v_mov_b32_e32 v4, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 -; SI-NEXT: v_mov_b32_e32 v9, 0 +; SI-NEXT: v_mov_b32_e32 v8, 0 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 -; SI-NEXT: v_mov_b32_e32 v11, v9 -; SI-NEXT: v_mov_b32_e32 v13, v9 -; SI-NEXT: v_mov_b32_e32 v15, v9 -; SI-NEXT: v_mov_b32_e32 v17, v9 -; SI-NEXT: v_mov_b32_e32 v20, s0 +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_mov_b32_e32 v12, v8 +; SI-NEXT: v_mov_b32_e32 v14, v8 +; SI-NEXT: v_mov_b32_e32 v15, v8 +; SI-NEXT: v_mov_b32_e32 v17, v8 +; SI-NEXT: v_mov_b32_e32 v18, v8 +; SI-NEXT: v_mov_b32_e32 v20, v8 +; SI-NEXT: v_mov_b32_e32 v21, v8 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v31, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 -; SI-NEXT: ds_write2_b64 v20, v[16:17], v[14:15] offset0:10 offset1:11 -; SI-NEXT: v_mov_b32_e32 v16, v9 -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v3 -; SI-NEXT: ds_write2_b64 v20, v[14:15], v[12:13] offset0:14 offset1:15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v7 -; SI-NEXT: ds_write2_b64 v20, v[15:16], v[10:11] offset0:6 offset1:7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v6 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v5, v9 -; SI-NEXT: ds_write2_b64 v20, v[4:5], v[8:9] offset0:2 offset1:3 -; SI-NEXT: v_mov_b32_e32 v19, v9 -; SI-NEXT: v_mov_b32_e32 v8, v9 -; SI-NEXT: v_mov_b32_e32 v15, v9 -; SI-NEXT: v_mov_b32_e32 v2, v9 -; SI-NEXT: v_mov_b32_e32 v4, v9 -; SI-NEXT: ds_write2_b64 v20, v[18:19], v[12:13] offset0:8 offset1:9 -; SI-NEXT: ds_write2_b64 v20, v[16:17], v[14:15] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v20, v[10:11], v[1:2] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v20, v[7:8], v[3:4] offset1:1 +; SI-NEXT: ds_write2_b64 v31, v[16:17], v[13:14] offset0:10 offset1:11 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v24, v8 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v3 +; SI-NEXT: ds_write2_b64 v31, v[19:20], v[11:12] offset0:14 offset1:15 +; SI-NEXT: v_mov_b32_e32 v11, v8 +; SI-NEXT: v_mov_b32_e32 v28, v8 +; SI-NEXT: v_mov_b32_e32 v30, v8 +; SI-NEXT: s_waitcnt lgkmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v7 +; SI-NEXT: ds_write2_b64 v31, v[20:21], v[9:10] offset0:6 offset1:7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; SI-NEXT: ds_write2_b64 v31, v[12:13], v[7:8] offset0:2 offset1:3 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v0 +; SI-NEXT: ds_write2_b64 v31, v[14:15], v[25:26] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v31, v[17:18], v[10:11] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v31, v[21:22], v[27:28] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v31, v[23:24], v[29:30] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64: @@ -6450,96 +6243,104 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, 0 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v8 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v21, v8 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, v8 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, s0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, v8 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v3 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[9:10] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[9:10] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[9:10] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; VI-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; VI-NO-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; VI-NO-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v4 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v4 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v5, s0 +; VI-NO-DS128-NEXT: ds_write2_b64 v5, v[20:21], v[7:8] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v20, v8 +; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NO-DS128-NEXT: ds_write2_b64 v5, v[17:18], v[19:20] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v8 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v17, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v5, v[16:17], v[6:7] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, v8 +; VI-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; VI-NO-DS128-NEXT: ds_write2_b64 v5, v[13:14], v[15:16] offset0:14 offset1:15 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, v8 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v8 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[7:8] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[3:4], v[9:10] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v5, v[3:4], v[12:13] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v8 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v8 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, v8 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; VI-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[2:3], v[6:7] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v5, v[2:3], v[11:12] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v8 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, v8 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[1:2], v[5:6] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v5, v[1:2], v[10:11] offset0:2 offset1:3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v8 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, v8 -; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[0:1], v[11:12] offset1:1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v5, v[0:1], v[9:10] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v21, v8 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v18, v8 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, v8 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v3 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v5 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[11:12], v[9:10] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[9:10] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[9:10] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v7 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v4 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v4 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v5, v[20:21], v[7:8] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v20, v8 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v5, v[17:18], v[19:20] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v5, v[16:17], v[6:7] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, v8 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[7:8] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v5, v[13:14], v[15:16] offset0:14 offset1:15 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[9:10] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v5, v[3:4], v[12:13] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v8 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v8 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[2:3], v[6:7] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v5, v[2:3], v[11:12] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[1:2], v[5:6] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v5, v[1:2], v[10:11] offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v8 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[0:1], v[13:14] offset1:1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v5, v[0:1], v[9:10] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v16i16_to_v16i64: @@ -6696,105 +6497,105 @@ define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 -; VI-DS128-NEXT: v_mov_b32_e32 v26, 0 -; VI-DS128-NEXT: v_mov_b32_e32 v22, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v24, v26 +; VI-DS128-NEXT: v_mov_b32_e32 v8, 0 +; VI-DS128-NEXT: v_mov_b32_e32 v24, v8 +; VI-DS128-NEXT: v_mov_b32_e32 v26, v8 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v5, s1 -; VI-DS128-NEXT: ds_read_b128 v[0:3], v5 -; VI-DS128-NEXT: ds_read_b128 v[13:16], v5 offset:16 -; VI-DS128-NEXT: v_mov_b32_e32 v11, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v19, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v8, v26 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v27, s0 +; VI-DS128-NEXT: v_mov_b32_e32 v21, v8 +; VI-DS128-NEXT: v_mov_b32_e32 v18, v8 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v2 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v13 -; VI-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v13 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; VI-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v14 -; VI-DS128-NEXT: v_mov_b32_e32 v14, s0 -; VI-DS128-NEXT: v_mov_b32_e32 v13, v26 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v16 -; VI-DS128-NEXT: ds_write_b128 v14, v[21:24] offset:64 -; VI-DS128-NEXT: v_mov_b32_e32 v21, v26 -; VI-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:32 -; VI-DS128-NEXT: v_mov_b32_e32 v10, v26 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; VI-DS128-NEXT: ds_write_b128 v27, v[23:26] offset:112 +; VI-DS128-NEXT: v_mov_b32_e32 v23, v8 +; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v2 +; VI-DS128-NEXT: ds_write_b128 v27, v[20:23] offset:96 +; VI-DS128-NEXT: v_mov_b32_e32 v20, v8 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v5 +; VI-DS128-NEXT: v_mov_b32_e32 v10, v8 +; VI-DS128-NEXT: ds_write_b128 v27, v[17:20] offset:32 +; VI-DS128-NEXT: v_mov_b32_e32 v15, v8 +; VI-DS128-NEXT: v_mov_b32_e32 v17, v8 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; VI-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v0 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; VI-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; VI-DS128-NEXT: ds_write_b128 v14, v[18:21] offset:112 -; VI-DS128-NEXT: v_mov_b32_e32 v16, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v18, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v1, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v3, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v28, v26 -; VI-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16 -; VI-DS128-NEXT: v_mov_b32_e32 v5, v26 -; VI-DS128-NEXT: v_mov_b32_e32 v7, v26 -; VI-DS128-NEXT: ds_write_b128 v14, v[15:18] offset:96 -; VI-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v14, v[25:28] offset:80 -; VI-DS128-NEXT: ds_write_b128 v14, v[4:7] +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; VI-DS128-NEXT: ds_write_b128 v27, v[7:10] offset:80 +; VI-DS128-NEXT: v_mov_b32_e32 v5, v8 +; VI-DS128-NEXT: v_mov_b32_e32 v7, v8 +; VI-DS128-NEXT: v_mov_b32_e32 v1, v8 +; VI-DS128-NEXT: v_mov_b32_e32 v3, v8 +; VI-DS128-NEXT: ds_write_b128 v27, v[14:17] offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v12, v8 +; VI-DS128-NEXT: v_mov_b32_e32 v14, v8 +; VI-DS128-NEXT: ds_write_b128 v27, v[4:7] offset:64 +; VI-DS128-NEXT: ds_write_b128 v27, v[0:3] offset:48 +; VI-DS128-NEXT: ds_write_b128 v27, v[11:14] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-DS128-NEXT: v_mov_b32_e32 v25, 0 -; GFX9-DS128-NEXT: v_mov_b32_e32 v21, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v25 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v25, v4 +; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v4 +; GFX9-DS128-NEXT: v_mov_b32_e32 v22, v4 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 -; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v28, s0 -; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v25 +; GFX9-DS128-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v5 +; GFX9-DS128-NEXT: ds_read_b128 v[5:8], v5 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v19, v4 +; GFX9-DS128-NEXT: v_mov_b32_e32 v16, v4 +; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v4 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v2 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v6 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[20:23] offset:112 -; GFX9-DS128-NEXT: v_mov_b32_e32 v20, v25 -; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v2 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[17:20] offset:96 -; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v25 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v1 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[14:17] offset:32 -; GFX9-DS128-NEXT: v_mov_b32_e32 v14, v25 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v7 +; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v7 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v5 ; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v5 -; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[11:14] offset:16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v9, v25 -; GFX9-DS128-NEXT: v_mov_b32_e32 v11, v25 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[4:7] offset:64 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[0:3] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[24:27] offset:80 -; GFX9-DS128-NEXT: ds_write_b128 v28, v[8:11] +; GFX9-DS128-NEXT: v_mov_b32_e32 v7, s0 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v8 +; GFX9-DS128-NEXT: ds_write_b128 v7, v[24:27] offset:64 +; GFX9-DS128-NEXT: v_mov_b32_e32 v24, v4 +; GFX9-DS128-NEXT: ds_write_b128 v7, v[21:24] offset:112 +; GFX9-DS128-NEXT: v_mov_b32_e32 v21, v4 +; GFX9-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v2 +; GFX9-DS128-NEXT: ds_write_b128 v7, v[18:21] offset:96 +; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX9-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v0 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v1 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX9-DS128-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-DS128-NEXT: ds_write_b128 v7, v[15:18] offset:32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-DS128-NEXT: ds_write_b128 v7, v[3:6] offset:80 +; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-DS128-NEXT: ds_write_b128 v7, v[12:15] offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-DS128-NEXT: ds_write_b128 v7, v[0:3] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v7, v[9:12] ; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = zext <16 x i16> %load to <16 x i64> @@ -6811,55 +6612,55 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 -; SI-NEXT: v_mov_b32_e32 v18, s0 +; SI-NEXT: v_mov_b32_e32 v24, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_mov_b32_e32 v12, v3 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: v_mov_b32_e32 v14, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v7 +; SI-NEXT: v_mov_b32_e32 v16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v4 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v8, 16, v5 -; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v3 -; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:14 offset1:15 -; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1 -; SI-NEXT: v_bfe_i32 v12, v1, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:10 offset1:11 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v7 ; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; SI-NEXT: v_bfe_i32 v12, v14, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:6 offset1:7 -; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 -; SI-NEXT: v_bfe_i32 v5, v6, 0, 16 -; SI-NEXT: v_bfe_i32 v10, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v7, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v12, v19, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v12, 16, v3 +; SI-NEXT: v_bfe_i32 v14, v14, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; SI-NEXT: ds_write2_b64 v24, v[14:15], v[12:13] offset0:14 offset1:15 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v12, 16, v1 +; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v5, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v14, v6, 0, 16 +; SI-NEXT: v_bfe_i32 v16, v16, 0, 16 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v1, 0, 16 +; SI-NEXT: v_bfe_i32 v18, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v20, v20, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_bfe_i32 v14, v17, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; SI-NEXT: ds_write2_b64 v24, v[1:2], v[12:13] offset0:10 offset1:11 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: v_bfe_i32 v16, v16, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; SI-NEXT: ds_write2_b64 v18, v[3:4], v[8:9] offset0:2 offset1:3 -; SI-NEXT: v_bfe_i32 v3, v15, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; SI-NEXT: v_bfe_i32 v12, v17, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: ds_write2_b64 v18, v[7:8], v[3:4] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v18, v[10:11], v[16:17] offset0:8 offset1:9 -; SI-NEXT: ds_write2_b64 v18, v[5:6], v[14:15] offset0:4 offset1:5 -; SI-NEXT: ds_write2_b64 v18, v[1:2], v[12:13] offset1:1 +; SI-NEXT: v_bfe_i32 v22, v21, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_bfe_i32 v7, v19, 0, 16 +; SI-NEXT: ds_write2_b64 v24, v[16:17], v[10:11] offset0:6 offset1:7 +; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; SI-NEXT: ds_write2_b64 v24, v[5:6], v[8:9] offset0:2 offset1:3 +; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; SI-NEXT: ds_write2_b64 v24, v[18:19], v[7:8] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v24, v[0:1], v[22:23] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v24, v[14:15], v[12:13] offset0:4 offset1:5 +; SI-NEXT: ds_write2_b64 v24, v[3:4], v[20:21] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64: @@ -6870,59 +6671,59 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v19, s0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v21, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, v7 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; VI-NO-DS128-NEXT: ds_write2_b64 v21, v[18:19], v[16:17] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, v3 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; VI-NO-DS128-NEXT: ds_write2_b64 v21, v[18:19], v[16:17] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v6, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; VI-NO-DS128-NEXT: ds_write2_b64 v21, v[19:20], v[16:17] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v7 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-NO-DS128-NEXT: v_bfe_i32 v18, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: ds_write2_b64 v21, v[6:7], v[4:5] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: ds_write2_b64 v21, v[2:3], v[14:15] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v21, v[16:17], v[12:13] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v21, v[0:1], v[10:11] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v21, v[18:19], v[8:9] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64: @@ -6932,59 +6733,59 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v19, s0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v21, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v21, v[18:19], v[16:17] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v18, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v21, v[18:19], v[16:17] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v21, v[19:20], v[16:17] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v7 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v21, v[6:7], v[4:5] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v21, v[2:3], v[14:15] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v21, v[16:17], v[12:13] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v21, v[0:1], v[10:11] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v21, v[18:19], v[8:9] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v16i16_to_v16i64: @@ -7163,124 +6964,124 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 -; VI-DS128-NEXT: ds_read_b128 v[3:6], v0 -; VI-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 +; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 +; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v24, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_mov_b32_e32 v18, v6 +; VI-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; VI-DS128-NEXT: v_bfe_i32 v18, v0, 0, 16 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-DS128-NEXT: v_bfe_i32 v13, v8, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 -; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80 -; VI-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 -; VI-DS128-NEXT: v_mov_b32_e32 v15, v10 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64 -; VI-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112 -; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; VI-DS128-NEXT: v_bfe_i32 v19, v5, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; VI-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-DS128-NEXT: ds_write_b128 v24, v[19:22] offset:80 +; VI-DS128-NEXT: v_bfe_i32 v20, v4, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v22, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v14, v1, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; VI-DS128-NEXT: v_mov_b32_e32 v1, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; VI-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:64 +; VI-DS128-NEXT: v_mov_b32_e32 v0, v3 +; VI-DS128-NEXT: v_bfe_i32 v20, v1, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v7 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; VI-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96 -; VI-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v3, v6, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v22, v1, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 31, v22 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: ds_write_b128 v24, v[3:6] offset:96 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48 -; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 -; VI-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 +; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 +; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:32 +; VI-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:16 +; VI-DS128-NEXT: ds_write_b128 v24, v[8:11] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[3:6], v0 -; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX9-DS128-NEXT: v_bfe_i32 v18, v0, 0, 16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX9-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v3, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80 -; GFX9-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v10 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64 -; GFX9-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v6 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v5, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[19:22] offset:80 +; GFX9-DS128-NEXT: v_bfe_i32 v20, v4, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v1, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:64 +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-DS128-NEXT: v_bfe_i32 v20, v1, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-DS128-NEXT: v_bfe_i32 v3, v6, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v1, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 31, v22 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[3:6] offset:96 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[8:11] ; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = sext <16 x i16> %load to <16 x i64> @@ -7292,239 +7093,253 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-LABEL: local_zextload_v32i16_to_v32i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v13, s1 ; SI-NEXT: s_mov_b32 m0, -1 -; SI-NEXT: ds_read2_b64 v[2:5], v0 offset0:2 offset1:3 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: ds_read2_b64 v[6:9], v0 offset1:1 -; SI-NEXT: v_mov_b32_e32 v19, v1 -; SI-NEXT: v_mov_b32_e32 v21, v1 -; SI-NEXT: v_mov_b32_e32 v22, s0 -; SI-NEXT: s_waitcnt lgkmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v5 -; SI-NEXT: ds_read2_b64 v[10:13], v0 offset0:4 offset1:5 -; SI-NEXT: ds_read2_b64 v[14:17], v0 offset0:6 offset1:7 -; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:14 offset1:15 +; SI-NEXT: ds_read2_b64 v[5:8], v13 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[0:3], v13 offset1:1 +; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: ds_read2_b64 v[9:12], v13 offset0:4 offset1:5 +; SI-NEXT: ds_read2_b64 v[13:16], v13 offset0:6 offset1:7 +; SI-NEXT: s_waitcnt lgkmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v8 +; SI-NEXT: v_mov_b32_e32 v26, s0 +; SI-NEXT: ds_write2_b64 v26, v[19:20], v[17:18] offset0:14 offset1:15 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v6 +; SI-NEXT: ds_write2_b64 v26, v[19:20], v[17:18] offset0:10 offset1:11 +; SI-NEXT: v_mov_b32_e32 v17, v4 +; SI-NEXT: v_mov_b32_e32 v19, v4 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: s_waitcnt lgkmcnt(4) ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3 -; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:10 offset1:11 +; SI-NEXT: ds_write2_b64 v26, v[20:21], v[18:19] offset0:6 offset1:7 +; SI-NEXT: v_mov_b32_e32 v23, v4 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v1 +; SI-NEXT: ds_write2_b64 v26, v[22:23], v[20:21] offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt lgkmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v9 -; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:6 offset1:7 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; SI-NEXT: ds_write2_b64 v26, v[22:23], v[20:21] offset0:30 offset1:31 +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v20, v4 +; SI-NEXT: v_mov_b32_e32 v22, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; SI-NEXT: ds_write2_b64 v26, v[21:22], v[18:19] offset0:26 offset1:27 +; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: v_mov_b32_e32 v14, v4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; SI-NEXT: ds_write2_b64 v26, v[18:19], v[16:17] offset0:22 offset1:23 +; SI-NEXT: v_mov_b32_e32 v12, v4 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v7 -; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:2 offset1:3 -; SI-NEXT: s_waitcnt lgkmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v17 -; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:30 offset1:31 -; SI-NEXT: v_mov_b32_e32 v18, v1 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; SI-NEXT: v_mov_b32_e32 v20, v1 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v15 -; SI-NEXT: ds_write2_b64 v22, v[19:20], v[17:18] offset0:26 offset1:27 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; SI-NEXT: ds_write2_b64 v22, v[19:20], v[17:18] offset0:22 offset1:23 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: ds_write2_b64 v22, v[4:5], v[17:18] offset0:12 offset1:13 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v2 -; SI-NEXT: v_mov_b32_e32 v4, v1 -; SI-NEXT: ds_write2_b64 v22, v[17:18], v[3:4] offset0:8 offset1:9 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: ds_write2_b64 v22, v[8:9], v[2:3] offset0:4 offset1:5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v10 -; SI-NEXT: ds_write2_b64 v22, v[6:7], v[4:5] offset1:1 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v14 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v16 -; SI-NEXT: v_mov_b32_e32 v6, v1 -; SI-NEXT: ds_write2_b64 v22, v[5:6], v[0:1] offset0:18 offset1:19 -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v13, v1 -; SI-NEXT: v_mov_b32_e32 v16, v1 -; SI-NEXT: ds_write2_b64 v22, v[19:20], v[12:13] offset0:28 offset1:29 -; SI-NEXT: ds_write2_b64 v22, v[17:18], v[15:16] offset0:24 offset1:25 -; SI-NEXT: ds_write2_b64 v22, v[10:11], v[2:3] offset0:20 offset1:21 -; SI-NEXT: ds_write2_b64 v22, v[4:5], v[8:9] offset0:16 offset1:17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: ds_write2_b64 v26, v[7:8], v[18:19] offset0:12 offset1:13 +; SI-NEXT: v_mov_b32_e32 v7, v4 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: ds_write2_b64 v26, v[5:6], v[18:19] offset0:8 offset1:9 +; SI-NEXT: v_mov_b32_e32 v5, v4 +; SI-NEXT: v_mov_b32_e32 v21, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; SI-NEXT: ds_write2_b64 v26, v[6:7], v[3:4] offset0:18 offset1:19 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v25, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v2 +; SI-NEXT: ds_write2_b64 v26, v[19:20], v[4:5] offset0:4 offset1:5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: ds_write2_b64 v26, v[0:1], v[20:21] offset1:1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 +; SI-NEXT: ds_write2_b64 v26, v[13:14], v[22:23] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v26, v[11:12], v[2:3] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v26, v[16:17], v[5:6] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v26, v[7:8], v[24:25] offset0:16 offset1:17 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v5, 0 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v19, v5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v21, v5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, 0 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v19, v1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v21, v1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v22, s0 -; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset1:1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[2:5], v0 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v25, s0 +; VI-NO-DS128-NEXT: ds_read2_b64 v[6:9], v0 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v0 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v0 offset1:1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v22, v1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v24, v1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29 -; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v1 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[1:2] offset0:26 offset1:27 -; VI-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v4 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[18:19], v[20:21] offset0:28 offset1:29 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, v1 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[3:4], v[18:19] offset0:26 offset1:27 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[20:21], v[18:19] offset0:24 offset1:25 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(5) -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[18:19] offset0:22 offset1:23 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[8:9] offset0:20 offset1:21 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[7:8] offset0:18 offset1:19 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v13 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(10) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, v5 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v17 -; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v17 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v16 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v5 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v15 -; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[8:9], v[0:1] offset1:1 +; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v9 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[20:21], v[18:19] offset0:22 offset1:23 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[20:21], v[18:19] offset0:20 offset1:21 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; VI-NO-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v1 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[7:8], v[21:22] offset0:18 offset1:19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; VI-NO-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v6 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[23:24], v[21:22] offset0:16 offset1:17 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[0:1], v[23:24] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; VI-NO-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v13 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[23:24], v[5:6] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v23, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v11 +; VI-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[12:13], v[22:23] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, v1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v22, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[11:12], v[21:22] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, v1 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(11) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v16 +; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v17 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[10:11], v[7:8] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v17, v1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v15 +; VI-NO-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[16:17], v[9:10] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, v1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v5, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v21, v1 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[15:16], v[4:5] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v15, v1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, v1 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[20:21], v[18:19] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v25, v[14:15], v[3:4] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v19, v5 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v21, v5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v21, v1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v22, v1 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v22, s0 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[2:5], v0 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[6:9], v0 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v25, s0 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v0 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v0 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v24, v1 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v1 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[1:2] offset0:26 offset1:27 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v8 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[18:19], v[20:21] offset0:28 offset1:29 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[7:8], v[18:19] offset0:26 offset1:27 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[20:21], v[18:19] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[5:6], v[18:19] offset0:22 offset1:23 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v4 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[20:21], v[18:19] offset0:20 offset1:21 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v20, v1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[3:4], v[19:20] offset0:18 offset1:19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v2 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[21:22], v[19:20] offset0:16 offset1:17 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v9 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v9 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v9 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[18:19] offset0:22 offset1:23 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[8:9] offset0:20 offset1:21 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[7:8] offset0:18 offset1:19 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v14 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v12 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, v5 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[8:9], v[0:1] offset1:1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[0:1], v[21:22] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v17 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v16 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[23:24], v[21:22] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v21, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[16:17], v[20:21] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, v1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v20, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[15:16], v[19:20] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v13 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[14:15], v[3:4] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, v1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[13:14], v[18:19] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[12:13], v[5:6] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, v1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[11:12], v[8:9] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v25, v[10:11], v[7:8] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_zextload_v32i16_to_v32i64: @@ -7826,193 +7641,169 @@ define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: v_mov_b32_e32 v1, 0 +; VI-DS128-NEXT: v_mov_b32_e32 v23, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v25, v1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_mov_b32_e32 v1, s1 -; VI-DS128-NEXT: ds_read_b128 v[3:6], v1 -; VI-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:16 -; VI-DS128-NEXT: v_mov_b32_e32 v52, s0 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v8 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; VI-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v10 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v9 -; VI-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[29:32], v1 offset:48 -; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v6 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; VI-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v7 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; VI-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v9 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v30 -; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v30 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v32 -; VI-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v32 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v31 -; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v31, 0 -; VI-DS128-NEXT: v_mov_b32_e32 v49, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v51, v31 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v29 -; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v29 -; VI-DS128-NEXT: ds_write_b128 v52, v[48:51] offset:240 -; VI-DS128-NEXT: v_mov_b32_e32 v46, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v48, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v27, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v29, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[45:48] offset:192 -; VI-DS128-NEXT: v_mov_b32_e32 v43, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v45, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[26:29] offset:96 -; VI-DS128-NEXT: v_mov_b32_e32 v24, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v26, v31 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v10 -; VI-DS128-NEXT: ds_write_b128 v52, v[42:45] offset:160 -; VI-DS128-NEXT: v_mov_b32_e32 v40, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v42, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[23:26] offset:112 -; VI-DS128-NEXT: v_mov_b32_e32 v21, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v23, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[39:42] offset:176 -; VI-DS128-NEXT: v_mov_b32_e32 v37, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v39, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[20:23] offset:64 -; VI-DS128-NEXT: v_mov_b32_e32 v18, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v20, v31 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v8 -; VI-DS128-NEXT: v_mov_b32_e32 v8, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v10, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[36:39] offset:128 -; VI-DS128-NEXT: v_mov_b32_e32 v34, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v36, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[17:20] offset:80 -; VI-DS128-NEXT: v_mov_b32_e32 v15, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v17, v31 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; VI-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; VI-DS128-NEXT: ds_write_b128 v52, v[7:10] offset:208 -; VI-DS128-NEXT: ds_write_b128 v52, v[33:36] offset:144 -; VI-DS128-NEXT: v_mov_b32_e32 v5, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v7, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v33, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[14:17] offset:48 -; VI-DS128-NEXT: v_mov_b32_e32 v12, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v14, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v1, v31 -; VI-DS128-NEXT: v_mov_b32_e32 v3, v31 -; VI-DS128-NEXT: ds_write_b128 v52, v[4:7] offset:32 -; VI-DS128-NEXT: ds_write_b128 v52, v[30:33] offset:224 -; VI-DS128-NEXT: ds_write_b128 v52, v[11:14] -; VI-DS128-NEXT: ds_write_b128 v52, v[0:3] offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 +; VI-DS128-NEXT: ds_read_b128 v[6:9], v0 offset:48 +; VI-DS128-NEXT: ds_read_b128 v[10:13], v0 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[14:17], v0 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[18:21], v0 +; VI-DS128-NEXT: v_mov_b32_e32 v31, s0 +; VI-DS128-NEXT: v_mov_b32_e32 v27, v1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v13 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v24, 16, v7 +; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v7 +; VI-DS128-NEXT: ds_write_b128 v31, v[22:25] offset:208 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v12 +; VI-DS128-NEXT: ds_write_b128 v31, v[22:25] offset:160 +; VI-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v13 +; VI-DS128-NEXT: ds_write_b128 v31, v[24:27] offset:176 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; VI-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v10 +; VI-DS128-NEXT: v_mov_b32_e32 v28, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v30, v1 +; VI-DS128-NEXT: ds_write_b128 v31, v[27:30] offset:128 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v11 +; VI-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v11 +; VI-DS128-NEXT: v_mov_b32_e32 v3, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; VI-DS128-NEXT: v_mov_b32_e32 v5, v1 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(5) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; VI-DS128-NEXT: ds_write_b128 v31, v[27:30] offset:144 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; VI-DS128-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v16 +; VI-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v16 +; VI-DS128-NEXT: v_mov_b32_e32 v14, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v16, v1 +; VI-DS128-NEXT: ds_write_b128 v31, v[2:5] offset:240 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; VI-DS128-NEXT: ds_write_b128 v31, v[13:16] offset:64 +; VI-DS128-NEXT: v_mov_b32_e32 v11, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v13, v1 +; VI-DS128-NEXT: ds_write_b128 v31, v[2:5] offset:192 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18 +; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v18 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v20 +; VI-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v20 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; VI-DS128-NEXT: ds_write_b128 v31, v[27:30] offset:96 +; VI-DS128-NEXT: ds_write_b128 v31, v[10:13] offset:80 +; VI-DS128-NEXT: v_mov_b32_e32 v27, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v8, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v10, v1 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v19 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; VI-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; VI-DS128-NEXT: v_mov_b32_e32 v18, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v20, v1 +; VI-DS128-NEXT: ds_write_b128 v31, v[24:27] offset:32 +; VI-DS128-NEXT: v_mov_b32_e32 v22, v1 +; VI-DS128-NEXT: v_mov_b32_e32 v24, v1 +; VI-DS128-NEXT: ds_write_b128 v31, v[7:10] +; VI-DS128-NEXT: v_mov_b32_e32 v7, v1 +; VI-DS128-NEXT: ds_write_b128 v31, v[0:3] offset:224 +; VI-DS128-NEXT: ds_write_b128 v31, v[17:20] offset:112 +; VI-DS128-NEXT: ds_write_b128 v31, v[21:24] offset:48 +; VI-DS128-NEXT: ds_write_b128 v31, v[4:7] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v32i16_to_v32i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DS128-NEXT: v_mov_b32_e32 v22, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v24, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v25, v1 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[3:6], v1 -; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v52, s0 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v8 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7 -; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GFX9-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v10 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; GFX9-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v9 -; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[29:32], v1 offset:48 -; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v6 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v7 -; GFX9-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v7 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; GFX9-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v9 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v30 -; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v30 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v32 -; GFX9-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v32 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v31 -; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v31, 0 -; GFX9-DS128-NEXT: v_mov_b32_e32 v49, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v51, v31 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v29 -; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v29 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[48:51] offset:240 -; GFX9-DS128-NEXT: v_mov_b32_e32 v46, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v48, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v29, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[45:48] offset:192 -; GFX9-DS128-NEXT: v_mov_b32_e32 v43, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v45, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[26:29] offset:96 -; GFX9-DS128-NEXT: v_mov_b32_e32 v24, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v26, v31 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v10 -; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v10 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[42:45] offset:160 -; GFX9-DS128-NEXT: v_mov_b32_e32 v40, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v42, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[23:26] offset:112 -; GFX9-DS128-NEXT: v_mov_b32_e32 v21, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[39:42] offset:176 -; GFX9-DS128-NEXT: v_mov_b32_e32 v37, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v39, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[20:23] offset:64 -; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v20, v31 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; GFX9-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v8 -; GFX9-DS128-NEXT: v_mov_b32_e32 v8, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v10, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[36:39] offset:128 -; GFX9-DS128-NEXT: v_mov_b32_e32 v34, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v36, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[17:20] offset:80 -; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v31 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[7:10] offset:208 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[33:36] offset:144 -; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v33, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[14:17] offset:48 -; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v14, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v31 -; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v31 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[4:7] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[30:33] offset:224 -; GFX9-DS128-NEXT: ds_write_b128 v52, v[11:14] -; GFX9-DS128-NEXT: ds_write_b128 v52, v[0:3] offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[2:5], v0 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[6:9], v0 offset:48 +; GFX9-DS128-NEXT: v_mov_b32_e32 v34, s0 +; GFX9-DS128-NEXT: ds_read_b128 v[13:16], v0 +; GFX9-DS128-NEXT: ds_read_b128 v[17:20], v0 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v1 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GFX9-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v7 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[21:24] offset:208 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX9-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v4 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[21:24] offset:160 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX9-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v5 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[21:24] offset:176 +; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v2 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[24:27] offset:128 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; GFX9-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v3 +; GFX9-DS128-NEXT: v_mov_b32_e32 v28, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v30, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v1 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX9-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[27:30] offset:144 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(5) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v19 +; GFX9-DS128-NEXT: v_mov_b32_e32 v31, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v33, v1 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[9:12] offset:240 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX9-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v6 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX9-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v20 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[30:33] offset:96 +; GFX9-DS128-NEXT: v_mov_b32_e32 v30, v1 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[9:12] offset:192 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX9-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v15 +; GFX9-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v15 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v18 +; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[27:30] offset:112 +; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v1 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[24:27] offset:80 +; GFX9-DS128-NEXT: v_mov_b32_e32 v24, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[12:15] +; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v1 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[0:3] offset:224 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[16:19] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[21:24] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[4:7] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v34, v[9:12] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = zext <32 x i16> %load to <32 x i64> @@ -8027,107 +7818,107 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v12, s1 ; SI-NEXT: s_mov_b32 m0, -1 -; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3 -; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1 +; SI-NEXT: ds_read2_b64 v[0:3], v12 offset0:2 offset1:3 +; SI-NEXT: ds_read2_b64 v[4:7], v12 offset1:1 ; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:6 offset1:7 ; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5 +; SI-NEXT: v_mov_b32_e32 v16, s0 ; SI-NEXT: s_waitcnt lgkmcnt(3) -; SI-NEXT: v_mov_b32_e32 v18, v7 -; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v7 -; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v7 -; SI-NEXT: v_bfe_i32 v18, v18, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; SI-NEXT: v_mov_b32_e32 v7, s0 -; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:14 offset1:15 -; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v5 -; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v5 -; SI-NEXT: v_bfe_i32 v18, v5, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:10 offset1:11 +; SI-NEXT: v_mov_b32_e32 v19, v3 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v3 +; SI-NEXT: v_bfe_i32 v19, v19, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:14 offset1:15 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v1 +; SI-NEXT: v_bfe_i32 v19, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:10 offset1:11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 ; SI-NEXT: s_waitcnt lgkmcnt(4) -; SI-NEXT: v_mov_b32_e32 v5, v3 -; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v3 -; SI-NEXT: v_bfe_i32 v18, v5, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:6 offset1:7 -; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v1 -; SI-NEXT: v_bfe_i32 v18, v1, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:2 offset1:3 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v7 +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v7 +; SI-NEXT: v_bfe_i32 v19, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:6 offset1:7 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v5 +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v5 +; SI-NEXT: v_bfe_i32 v19, v5, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:2 offset1:3 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_waitcnt lgkmcnt(5) ; SI-NEXT: v_mov_b32_e32 v1, v11 -; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v11 -; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v11 -; SI-NEXT: v_bfe_i32 v18, v1, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:30 offset1:31 -; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v9 -; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v9 -; SI-NEXT: v_bfe_i32 v18, v9, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:26 offset1:27 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v11 +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v11 +; SI-NEXT: v_bfe_i32 v19, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:30 offset1:31 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v9 +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v9 +; SI-NEXT: v_bfe_i32 v19, v9, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:26 offset1:27 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v8 ; SI-NEXT: s_waitcnt lgkmcnt(6) ; SI-NEXT: v_mov_b32_e32 v1, v15 -; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v15 -; SI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:22 offset1:23 -; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v13 -; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v13 -; SI-NEXT: v_bfe_i32 v17, v13, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:18 offset1:19 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_bfe_i32 v5, v6, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: v_bfe_i32 v15, v1, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:12 offset1:13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 -; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 -; SI-NEXT: v_bfe_i32 v5, v1, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: ds_write2_b64 v7, v[3:4], v[5:6] offset0:8 offset1:9 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v12 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v15 +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v15 +; SI-NEXT: v_bfe_i32 v19, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:22 offset1:23 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v13 +; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v13 +; SI-NEXT: v_bfe_i32 v19, v13, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:18 offset1:19 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 ; SI-NEXT: v_bfe_i32 v1, v12, 0, 16 ; SI-NEXT: v_bfe_i32 v3, v14, 0, 16 ; SI-NEXT: v_bfe_i32 v5, v8, 0, 16 -; SI-NEXT: v_bfe_i32 v8, v10, 0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; SI-NEXT: v_bfe_i32 v9, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v10, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v12, v11, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; SI-NEXT: ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5 +; SI-NEXT: v_bfe_i32 v7, v10, 0, 16 +; SI-NEXT: v_bfe_i32 v9, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v11, v6, 0, 16 +; SI-NEXT: v_bfe_i32 v12, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v13, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v17, v17, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_bfe_i32 v13, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v19, v15, 0, 16 +; SI-NEXT: v_bfe_i32 v20, v21, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; SI-NEXT: ds_write2_b64 v16, v[13:14], v[20:21] offset0:12 offset1:13 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_bfe_i32 v15, v15, 0, 16 +; SI-NEXT: v_bfe_i32 v14, v26, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; SI-NEXT: v_bfe_i32 v16, v14, 0, 16 +; SI-NEXT: v_bfe_i32 v21, v23, 0, 16 +; SI-NEXT: v_bfe_i32 v22, v22, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; SI-NEXT: ds_write2_b64 v16, v[12:13], v[22:23] offset0:8 offset1:9 +; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; SI-NEXT: v_bfe_i32 v22, v25, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; SI-NEXT: ds_write2_b64 v7, v[9:10], v[16:17] offset1:1 -; SI-NEXT: v_bfe_i32 v17, v18, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; SI-NEXT: v_bfe_i32 v23, v24, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; SI-NEXT: ds_write2_b64 v16, v[11:12], v[23:24] offset0:4 offset1:5 ; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; SI-NEXT: ds_write2_b64 v7, v[8:9], v[17:18] offset0:28 offset1:29 -; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:24 offset1:25 -; SI-NEXT: ds_write2_b64 v7, v[3:4], v[13:14] offset0:20 offset1:21 -; SI-NEXT: ds_write2_b64 v7, v[1:2], v[11:12] offset0:16 offset1:17 +; SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; SI-NEXT: ds_write2_b64 v16, v[9:10], v[22:23] offset1:1 +; SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; SI-NEXT: ds_write2_b64 v16, v[7:8], v[21:22] offset0:28 offset1:29 +; SI-NEXT: ds_write2_b64 v16, v[5:6], v[14:15] offset0:24 offset1:25 +; SI-NEXT: ds_write2_b64 v16, v[3:4], v[19:20] offset0:20 offset1:21 +; SI-NEXT: ds_write2_b64 v16, v[1:2], v[17:18] offset0:16 offset1:17 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64: @@ -8135,112 +7926,113 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v7 offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v7 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s0 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s1 +; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v8 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[13:16], v8 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v8 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[8:11], v8 offset1:1 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, s0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v17, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v3, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[19:20], v[17:18] offset0:30 offset1:31 +; VI-NO-DS128-NEXT: v_bfe_i32 v17, v3, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; VI-NO-DS128-NEXT: v_bfe_i32 v18, v3, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; VI-NO-DS128-NEXT: ds_read2_b64 v[3:6], v7 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_read2_b64 v[7:10], v7 offset1:1 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:30 offset1:31 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v2, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[2:3], v[18:19] offset0:28 offset1:29 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:28 offset1:29 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[1:2], v[16:17] offset0:26 offset1:27 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[18:19] offset0:24 offset1:25 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(6) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v18, v15, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:22 offset1:23 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v14, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:20 offset1:21 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v13, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[14:15] offset0:18 offset1:19 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v12, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v16, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[17:18] offset0:16 offset1:17 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v1, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[19:20], v[2:3] offset0:26 offset1:27 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[19:20], v[2:3] offset0:24 offset1:25 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, v16 +; VI-NO-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[21:22], v[19:20] offset0:22 offset1:23 +; VI-NO-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; VI-NO-DS128-NEXT: v_bfe_i32 v20, v0, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v6, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[15:16] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_bfe_i32 v5, v12, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[15:16], v[20:21] offset0:20 offset1:21 +; VI-NO-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v21, v14, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[5:6] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[21:22], v[15:16] offset0:18 offset1:19 ; VI-NO-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v3, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v21, v13, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[21:22], v[15:16] offset0:16 offset1:17 ; VI-NO-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, v7 +; VI-NO-DS128-NEXT: v_bfe_i32 v23, v0, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; VI-NO-DS128-NEXT: v_bfe_i32 v19, v19, 0, 16 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[21:22] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[23:24], v[21:22] offset0:14 offset1:15 +; VI-NO-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; VI-NO-DS128-NEXT: v_bfe_i32 v22, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[6:7], v[22:23] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v24, v5, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, v11 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; VI-NO-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v9, v4, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v7, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v8, v9, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-NO-DS128-NEXT: v_bfe_i32 v23, v8, 0, 16 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[24:25], v[6:7] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_bfe_i32 v7, v10, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[19:20] offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[8:9], v[15:16] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[6:7], v[13:14] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[4:5], v[1:2] offset1:1 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[9:10], v[21:22] offset0:8 offset1:9 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[3:4], v[14:15] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[7:8], v[19:20] offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[5:6], v[1:2] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v12, v[23:24], v[17:18] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64: @@ -8250,109 +8042,110 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v8 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v8 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, s0 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[12:15], v8 offset1:1 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[8:11], v8 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, s0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v17, v17, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[19:20], v[17:18] offset0:30 offset1:31 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v17, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v7, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[11:14], v8 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[7:10], v8 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:30 offset1:31 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[6:7], v[18:19] offset0:28 offset1:29 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:28 offset1:29 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[16:17] offset0:26 offset1:27 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(5) +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[19:20], v[6:7] offset0:26 offset1:27 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:24 offset1:25 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[19:20], v[6:7] offset0:24 offset1:25 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v4, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset0:22 offset1:23 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[3:4], v[19:20] offset0:22 offset1:23 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v21, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:20 offset1:21 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[2:3] offset0:18 offset1:19 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset0:16 offset1:17 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v10, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[21:22], v[19:20] offset0:20 offset1:21 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[1:2], v[19:20] offset0:18 offset1:19 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v2, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[21:22], v[19:20] offset0:16 offset1:17 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[9:10], v[3:4] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v12 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[3:4] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v20, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v13 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[21:22], v[19:20] offset0:14 offset1:15 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v10, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v11 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v18, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v19, 0, 16 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[20:21] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[10:11], v[20:21] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v9, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, v15 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v13, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v7, v0, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v13, v8, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v21, v12, 0, 16 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[22:23], v[10:11] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v11, v14, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[12:13] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[1:2] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[5:6] offset1:1 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[13:14], v[19:20] offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[7:8], v[1:2] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[11:12], v[3:4] offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[9:10], v[5:6] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[21:22], v[17:18] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v32i16_to_v32i64: @@ -8705,229 +8498,229 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:48 -; VI-DS128-NEXT: ds_read_b128 v[9:12], v4 offset:32 -; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 -; VI-DS128-NEXT: ds_read_b128 v[17:20], v4 offset:16 -; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 +; VI-DS128-NEXT: v_mov_b32_e32 v14, s0 +; VI-DS128-NEXT: ds_read_b128 v[15:18], v4 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[6:9], v4 offset:16 +; VI-DS128-NEXT: ds_read_b128 v[10:13], v4 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) -; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v19, v2, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-DS128-NEXT: v_mov_b32_e32 v2, v3 +; VI-DS128-NEXT: v_bfe_i32 v21, v2, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-DS128-NEXT: v_mov_b32_e32 v4, v3 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:224 -; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v15, v3, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:240 -; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:208 +; VI-DS128-NEXT: ds_write_b128 v14, v[19:22] offset:224 +; VI-DS128-NEXT: v_bfe_i32 v19, v4, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v21, v3, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-DS128-NEXT: ds_write_b128 v14, v[19:22] offset:240 +; VI-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; VI-DS128-NEXT: ds_write_b128 v14, v[19:22] offset:192 +; VI-DS128-NEXT: v_bfe_i32 v19, v1, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(5) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v11 -; VI-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:192 -; VI-DS128-NEXT: v_mov_b32_e32 v13, v12 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:160 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12 -; VI-DS128-NEXT: v_bfe_i32 v0, v13, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:176 -; VI-DS128-NEXT: v_bfe_i32 v0, v9, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; VI-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:144 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; VI-DS128-NEXT: ds_write_b128 v14, v[19:22] offset:208 +; VI-DS128-NEXT: v_bfe_i32 v20, v17, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v22, v0, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(4) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; VI-DS128-NEXT: ds_write_b128 v14, v[20:23] offset:160 +; VI-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v0, v18 +; VI-DS128-NEXT: v_bfe_i32 v22, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; VI-DS128-NEXT: v_bfe_i32 v24, v0, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; VI-DS128-NEXT: ds_write_b128 v14, v[22:25] offset:176 +; VI-DS128-NEXT: v_bfe_i32 v23, v15, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v25, v0, 0, 16 +; VI-DS128-NEXT: v_mov_b32_e32 v0, v13 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; VI-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; VI-DS128-NEXT: ds_write_b128 v14, v[23:26] offset:128 +; VI-DS128-NEXT: v_bfe_i32 v23, v16, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v25, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; VI-DS128-NEXT: v_bfe_i32 v17, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; VI-DS128-NEXT: ds_write_b128 v14, v[23:26] offset:144 +; VI-DS128-NEXT: v_bfe_i32 v23, v8, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v25, v0, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; VI-DS128-NEXT: v_mov_b32_e32 v0, v9 +; VI-DS128-NEXT: ds_write_b128 v14, v[23:26] offset:96 +; VI-DS128-NEXT: v_bfe_i32 v23, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; VI-DS128-NEXT: v_bfe_i32 v25, v0, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; VI-DS128-NEXT: ds_write_b128 v14, v[23:26] offset:112 +; VI-DS128-NEXT: v_bfe_i32 v23, v6, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v25, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; VI-DS128-NEXT: v_bfe_i32 v19, v10, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; VI-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; VI-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v14, v[23:26] offset:64 +; VI-DS128-NEXT: v_bfe_i32 v23, v7, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v25, v0, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_bfe_i32 v9, v19, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:128 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(8) -; VI-DS128-NEXT: v_bfe_i32 v0, v5, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; VI-DS128-NEXT: v_mov_b32_e32 v5, v20 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:96 -; VI-DS128-NEXT: v_bfe_i32 v9, v5, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v20 -; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:112 -; VI-DS128-NEXT: v_bfe_i32 v9, v17, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v18 -; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:64 -; VI-DS128-NEXT: v_bfe_i32 v9, v4, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v15, v5, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v11, v4, 0, 16 -; VI-DS128-NEXT: v_mov_b32_e32 v4, v7 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:80 -; VI-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; VI-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 -; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:48 -; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; VI-DS128-NEXT: ds_write_b128 v14, v[23:26] offset:80 +; VI-DS128-NEXT: ds_write_b128 v14, v[8:11] offset:32 +; VI-DS128-NEXT: ds_write_b128 v14, v[15:18] offset:48 +; VI-DS128-NEXT: ds_write_b128 v14, v[19:22] +; VI-DS128-NEXT: ds_write_b128 v14, v[2:5] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DS128-NEXT: v_mov_b32_e32 v13, s1 -; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v13 offset:48 -; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v13 offset:32 -; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s0 -; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v13 -; GFX9-DS128-NEXT: ds_read_b128 v[18:21], v13 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s1 +; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v8 offset:48 +; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v8 offset:32 +; GFX9-DS128-NEXT: v_mov_b32_e32 v18, s0 +; GFX9-DS128-NEXT: ds_read_b128 v[14:17], v8 +; GFX9-DS128-NEXT: ds_read_b128 v[10:13], v8 offset:16 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) -; GFX9-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v6, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-DS128-NEXT: v_bfe_i32 v21, v6, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-DS128-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[14:17] offset:224 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v6, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v7, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:240 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:208 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(5) -; GFX9-DS128-NEXT: v_bfe_i32 v4, v2, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[19:22] offset:224 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v8, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v21, v7, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[19:22] offset:240 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v4, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX9-DS128-NEXT: v_bfe_i32 v21, v4, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[19:22] offset:192 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v5, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v21, v4, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[19:22] offset:208 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(6) +; GFX9-DS128-NEXT: v_bfe_i32 v20, v2, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:192 -; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v3 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v2, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(5) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[20:23] offset:160 +; GFX9-DS128-NEXT: v_bfe_i32 v21, v2, 0, 16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v2, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:160 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:176 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-DS128-NEXT: v_bfe_i32 v24, v2, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v20 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:144 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v20, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:128 -; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v21 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:96 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v21 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:112 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:64 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v19, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v11 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:80 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GFX9-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v2, v9, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[2:5] offset:128 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[2:5] offset:144 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(7) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[22:25] offset:176 +; GFX9-DS128-NEXT: v_bfe_i32 v23, v12, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v25, v3, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v13 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[23:26] offset:96 +; GFX9-DS128-NEXT: v_bfe_i32 v23, v4, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; GFX9-DS128-NEXT: v_bfe_i32 v25, v4, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[23:26] offset:112 +; GFX9-DS128-NEXT: v_bfe_i32 v23, v10, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v25, v4, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v16 +; GFX9-DS128-NEXT: v_bfe_i32 v19, v14, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v15 +; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v17 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v16, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[23:26] offset:64 +; GFX9-DS128-NEXT: v_bfe_i32 v23, v11, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v25, v4, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v15, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[17:20] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v12, v[6:9] -; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[23:26] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[12:15] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[0:3] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[19:22] +; GFX9-DS128-NEXT: ds_write_b128 v18, v[6:9] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <32 x i16>, ptr addrspace(3) %in %ext = sext <32 x i16> %load to <32 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index 8d24f6ba66968..6dd8f041b9912 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule %s -o - | FileCheck -check-prefix=GFX908 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908-GCNTRACKERS %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers %s -o - | FileCheck -check-prefix=GFX908-GCNTRACKERS %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=0 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=0 %s -o - | FileCheck -check-prefix=GFX908 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908-GCNTRACKERS %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule %s -o - | FileCheck -check-prefix=GFX908-GCNTRACKERS %s --- name: test_occ_10_max_occ_no_sink diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll index 17581bcb61e99..5f2c9caf45469 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll @@ -1,12 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX900 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10_1 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10_3 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-use-amdgpu-trackers=0 < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -amdgpu-use-amdgpu-trackers=0 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack -amdgpu-use-amdgpu-trackers=0 < %s | FileCheck -check-prefixes=GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+xnack -amdgpu-use-amdgpu-trackers=0 < %s | FileCheck -check-prefixes=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -amdgpu-use-amdgpu-trackers=0 < %s | FileCheck -check-prefix=GFX10_1 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-amdgpu-trackers=0 < %s | FileCheck -check-prefix=GFX10_3 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-amdgpu-trackers=0 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-amdgpu-trackers=0 < %s | FileCheck -check-prefix=GFX12 %s + +; FIXME: GCN Trackers do not track pressure from PhysRegs. This test uses inline +; assembly but GCN trackers do not account for the physical registers, leading to +; out of registers during RA. Hence GCN trackers are disabled for this test. %asm.output = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs <16 x i32>, <7 x i32>, ; vgprs diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 5b7c36559a366..a4c7922430ff9 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -98,59 +98,60 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v24, 0 +; CHECK-NEXT: v_mov_b32_e32 v28, 0 ; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 -; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 -; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 -; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v28, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v28, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v28, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v28, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v28, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v28, s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v28, s[0:1] offset:16 ; CHECK-NEXT: s_addc_u32 s21, s21, 0 -; CHECK-NEXT: v_mov_b32_e32 v25, s2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124 -; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120 -; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116 -; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112 -; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: v_mov_b32_e32 v29, s2 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: buffer_store_dword v3, v29, s[20:23], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v29, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v29, s[20:23], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v29, s[20:23], 0 offen offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108 -; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104 -; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100 -; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: buffer_store_dword v7, v29, s[20:23], 0 offen offset:108 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v28, s[0:1] +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v6, v29, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v29, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v29, s[20:23], 0 offen offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92 -; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88 -; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84 -; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80 +; CHECK-NEXT: buffer_store_dword v11, v29, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v29, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v29, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v29, s[20:23], 0 offen offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76 -; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72 -; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68 -; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64 +; CHECK-NEXT: buffer_store_dword v15, v29, s[20:23], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v29, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v29, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v29, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60 -; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56 -; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52 -; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_store_dword v19, v29, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v29, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v29, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v29, s[20:23], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44 -; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40 -; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36 -; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28 -; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24 -; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20 -; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12 -; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen +; CHECK-NEXT: buffer_store_dword v23, v29, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v29, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v29, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v29, s[20:23], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_dword v27, v29, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v26, v29, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v25, v29, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v24, v29, s[20:23], 0 offen offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(23) +; CHECK-NEXT: buffer_store_dword v3, v29, s[20:23], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v2, v29, s[20:23], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v1, v29, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v0, v29, s[20:23], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -386,59 +387,60 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v24, 0 +; CHECK-NEXT: v_mov_b32_e32 v28, 0 ; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 -; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 -; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 -; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 -; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v28, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v28, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v28, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v28, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v28, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v28, s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx4 v[24:27], v28, s[0:1] offset:16 ; CHECK-NEXT: s_addc_u32 s21, s21, 0 -; CHECK-NEXT: v_mov_b32_e32 v25, s2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124 -; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120 -; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116 -; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112 -; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: v_mov_b32_e32 v29, s2 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: buffer_store_dword v3, v29, s[20:23], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v29, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v29, s[20:23], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v29, s[20:23], 0 offen offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108 -; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104 -; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100 -; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: buffer_store_dword v7, v29, s[20:23], 0 offen offset:108 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v28, s[0:1] +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v6, v29, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v29, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v29, s[20:23], 0 offen offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92 -; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88 -; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84 -; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80 +; CHECK-NEXT: buffer_store_dword v11, v29, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v29, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v29, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v29, s[20:23], 0 offen offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76 -; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72 -; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68 -; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64 +; CHECK-NEXT: buffer_store_dword v15, v29, s[20:23], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v29, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v29, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v29, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60 -; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56 -; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52 -; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_store_dword v19, v29, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v29, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v29, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v29, s[20:23], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44 -; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40 -; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36 -; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28 -; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24 -; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20 -; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12 -; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen +; CHECK-NEXT: buffer_store_dword v23, v29, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v29, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v29, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v29, s[20:23], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_dword v27, v29, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v26, v29, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v25, v29, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v24, v29, s[20:23], 0 offen offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(23) +; CHECK-NEXT: buffer_store_dword v3, v29, s[20:23], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v2, v29, s[20:23], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v1, v29, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v0, v29, s[20:23], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index cb68a987c243b..00302390f6788 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -1649,497 +1649,499 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; ALIGNED-NEXT: .LBB2_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[8:9], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[8:9], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[8:9], off offset:208 -; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[8:9], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[8:9], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[8:9], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[8:9], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[8:9], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[8:9], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[8:9], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[8:9], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[8:9], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[8:9], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[8:9], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[8:9], off -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[8:9], off offset:16 -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 6 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 3 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[4:5], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[4:5], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: v_add_co_u32 v66, vcc_lo, v64, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v67, null, 0, v65, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v68, vcc_lo, v64, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v69, null, 0, v65, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:248 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:249 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:245 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:240 -; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:241 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:236 -; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:248 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:246 -; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:252 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v82 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v82 offset:244 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v83 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v83 offset:248 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v81 offset:240 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v80 offset:236 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:250 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 -; ALIGNED-NEXT: flat_store_byte v[86:87], v116 offset:244 -; ALIGNED-NEXT: flat_store_byte v[86:87], v117 offset:242 -; ALIGNED-NEXT: flat_store_byte v[86:87], v118 offset:240 -; ALIGNED-NEXT: flat_store_byte v[86:87], v119 offset:238 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:232 -; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:233 -; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:229 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:224 -; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:225 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:220 -; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v86 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v86 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:242 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v85 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:238 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v85 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v84 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:232 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83 -; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:230 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:236 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:234 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:228 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:226 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v98 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v87 offset:232 +; ALIGNED-NEXT: flat_store_byte v[66:67], v87 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v99 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v86 offset:228 +; ALIGNED-NEXT: flat_store_byte v[66:67], v86 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v99 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v97 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v85 offset:224 +; ALIGNED-NEXT: flat_store_byte v[66:67], v85 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v96 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:222 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:222 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:216 -; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:217 -; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:213 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:208 -; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:209 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:204 -; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:216 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68 -; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:214 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:220 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v102 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v84 offset:220 +; ALIGNED-NEXT: flat_store_byte v[66:67], v84 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v102 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:216 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v103 +; ALIGNED-NEXT: flat_store_byte v[66:67], v87 offset:214 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v103 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:220 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v101 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v99 offset:216 +; ALIGNED-NEXT: flat_store_byte v[66:67], v99 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v101 +; ALIGNED-NEXT: flat_store_byte v[66:67], v86 offset:218 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v98 offset:212 +; ALIGNED-NEXT: flat_store_byte v[66:67], v98 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v100 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:212 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66 -; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:218 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:212 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:210 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:206 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:200 -; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:201 -; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:197 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:192 -; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:193 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:188 -; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v114 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:210 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v114 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v97 offset:208 +; ALIGNED-NEXT: flat_store_byte v[66:67], v97 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v115 +; ALIGNED-NEXT: flat_store_byte v[66:67], v85 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v115 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:206 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v96 offset:204 +; ALIGNED-NEXT: flat_store_byte v[66:67], v96 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v113 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:200 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v112 +; ALIGNED-NEXT: flat_store_byte v[66:67], v84 offset:198 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v112 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v103 offset:200 +; ALIGNED-NEXT: flat_store_byte v[66:67], v103 offset:201 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:204 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:200 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:198 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:204 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52 -; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:202 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:196 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:194 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:192 +; ALIGNED-NEXT: flat_store_byte v[66:67], v87 offset:202 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v55 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v102 offset:196 +; ALIGNED-NEXT: flat_store_byte v[66:67], v102 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:196 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[66:67], v99 offset:194 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v101 offset:192 +; ALIGNED-NEXT: flat_store_byte v[66:67], v101 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[66:67], v86 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[66:67], v98 offset:190 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:190 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:184 -; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:185 -; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:181 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:176 -; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:177 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:172 -; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:184 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 -; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:182 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:188 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:186 -; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:180 -; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:178 -; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:176 -; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:174 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v50 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v100 offset:188 +; ALIGNED-NEXT: flat_store_byte v[66:67], v100 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v50 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:184 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v51 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:182 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[66:67], v97 offset:188 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v49 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v115 offset:184 +; ALIGNED-NEXT: flat_store_byte v[66:67], v115 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[66:67], v85 offset:186 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v114 offset:180 +; ALIGNED-NEXT: flat_store_byte v[66:67], v114 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:180 +; ALIGNED-NEXT: flat_store_byte v[66:67], v96 offset:178 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v113 offset:176 +; ALIGNED-NEXT: flat_store_byte v[66:67], v113 offset:177 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:176 +; ALIGNED-NEXT: flat_store_byte v[66:67], v84 offset:174 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v112 offset:172 +; ALIGNED-NEXT: flat_store_byte v[66:67], v112 offset:173 ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:164 -; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:165 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:168 -; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:169 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:156 -; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:157 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:160 -; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:161 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[66:67], v103 offset:166 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:168 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v54 offset:164 +; ALIGNED-NEXT: flat_store_byte v[66:67], v54 offset:165 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:168 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35 -; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:166 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:172 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 -; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:170 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:160 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:158 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:164 -; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:162 +; ALIGNED-NEXT: flat_store_byte v[66:67], v87 offset:172 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[66:67], v102 offset:170 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v55 offset:168 +; ALIGNED-NEXT: flat_store_byte v[66:67], v55 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[66:67], v99 offset:158 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v33 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v52 offset:156 +; ALIGNED-NEXT: flat_store_byte v[66:67], v52 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[66:67], v101 offset:164 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[66:67], v86 offset:162 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v53 offset:160 +; ALIGNED-NEXT: flat_store_byte v[66:67], v53 offset:161 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:152 +; ALIGNED-NEXT: flat_store_byte v[66:67], v98 offset:152 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v31 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v50 offset:148 +; ALIGNED-NEXT: flat_store_byte v[66:67], v50 offset:149 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v28 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:150 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:152 -; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:153 -; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:149 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:144 -; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:145 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:140 -; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:141 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[66:67], v100 offset:150 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:156 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v51 offset:152 +; ALIGNED-NEXT: flat_store_byte v[66:67], v51 offset:153 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:156 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:154 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[66:67], v97 offset:148 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:154 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:148 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27 -; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:146 -; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:144 -; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:142 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[66:67], v115 offset:146 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v26 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v49 offset:144 +; ALIGNED-NEXT: flat_store_byte v[66:67], v49 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[66:67], v85 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[66:67], v114 offset:142 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v48 offset:140 +; ALIGNED-NEXT: flat_store_byte v[66:67], v48 offset:141 ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:136 -; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:137 -; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:133 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:128 -; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:124 -; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:136 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:134 -; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:140 -; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:138 -; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:132 -; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:130 -; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:126 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18 +; ALIGNED-NEXT: flat_store_byte v[66:67], v113 offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[66:67], v103 offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v25 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:136 +; ALIGNED-NEXT: flat_store_byte v[66:67], v96 offset:134 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v39 offset:136 +; ALIGNED-NEXT: flat_store_byte v[66:67], v39 offset:137 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v38 offset:132 +; ALIGNED-NEXT: flat_store_byte v[66:67], v38 offset:133 +; ALIGNED-NEXT: flat_store_byte v[66:67], v84 offset:132 +; ALIGNED-NEXT: flat_store_byte v[66:67], v112 offset:130 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v37 offset:128 +; ALIGNED-NEXT: flat_store_byte v[66:67], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:128 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v36 offset:124 +; ALIGNED-NEXT: flat_store_byte v[64:65], v36 offset:128 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v18 ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:120 -; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:121 -; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:117 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:112 -; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:113 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:108 -; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:109 -; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:120 -; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:118 -; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:122 -; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:116 -; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:114 -; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:112 -; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:110 +; ALIGNED-NEXT: flat_store_byte v[66:67], v54 offset:120 +; ALIGNED-NEXT: flat_store_byte v[66:67], v87 offset:118 +; ALIGNED-NEXT: flat_store_byte v[66:67], v102 offset:124 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v35 offset:120 +; ALIGNED-NEXT: flat_store_byte v[66:67], v35 offset:121 +; ALIGNED-NEXT: flat_store_byte v[66:67], v55 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v34 offset:116 +; ALIGNED-NEXT: flat_store_byte v[66:67], v34 offset:117 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[66:67], v99 offset:114 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v33 offset:112 +; ALIGNED-NEXT: flat_store_byte v[66:67], v33 offset:113 +; ALIGNED-NEXT: flat_store_byte v[66:67], v52 offset:112 +; ALIGNED-NEXT: flat_store_byte v[66:67], v101 offset:110 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v32 offset:108 +; ALIGNED-NEXT: flat_store_byte v[66:67], v32 offset:109 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:104 -; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:105 -; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:101 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:96 -; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:97 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:92 -; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:93 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14 +; ALIGNED-NEXT: flat_store_byte v[66:67], v98 offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[66:67], v50 offset:94 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v14 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v31 offset:104 +; ALIGNED-NEXT: flat_store_byte v[66:67], v31 offset:105 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:94 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v28 offset:92 +; ALIGNED-NEXT: flat_store_byte v[66:67], v28 offset:93 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v6 ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15 -; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:104 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:102 -; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:108 -; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:106 -; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:100 -; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[66:67], v86 offset:104 +; ALIGNED-NEXT: flat_store_byte v[66:67], v53 offset:102 +; ALIGNED-NEXT: flat_store_byte v[66:67], v100 offset:106 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v11 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v30 offset:100 +; ALIGNED-NEXT: flat_store_byte v[66:67], v30 offset:101 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:100 +; ALIGNED-NEXT: flat_store_byte v[66:67], v51 offset:98 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v29 offset:96 +; ALIGNED-NEXT: flat_store_byte v[66:67], v29 offset:97 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:96 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:84 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:88 -; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:89 -; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:85 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:80 -; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:81 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:76 -; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:77 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20 +; ALIGNED-NEXT: flat_store_byte v[66:67], v97 offset:88 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v17 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[66:67], v115 offset:86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[66:67], v49 offset:92 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v5 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v27 offset:88 +; ALIGNED-NEXT: flat_store_byte v[66:67], v27 offset:89 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v5 -; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:88 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v4 -; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:92 -; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:90 -; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:84 -; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:82 -; ALIGNED-NEXT: flat_store_byte v[86:87], v36 offset:80 -; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:78 +; ALIGNED-NEXT: flat_store_byte v[66:67], v85 offset:90 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v26 offset:84 +; ALIGNED-NEXT: flat_store_byte v[66:67], v26 offset:85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[66:67], v114 offset:84 +; ALIGNED-NEXT: flat_store_byte v[66:67], v48 offset:82 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v25 offset:80 +; ALIGNED-NEXT: flat_store_byte v[66:67], v25 offset:81 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:80 +; ALIGNED-NEXT: flat_store_byte v[66:67], v96 offset:78 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v24 offset:76 +; ALIGNED-NEXT: flat_store_byte v[66:67], v24 offset:77 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:72 -; ALIGNED-NEXT: flat_store_byte v[86:87], v23 offset:73 -; ALIGNED-NEXT: flat_store_byte v[86:87], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:64 -; ALIGNED-NEXT: flat_store_byte v[86:87], v21 offset:65 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:60 -; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 -; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:72 -; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:70 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:76 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:74 -; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:68 -; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:66 -; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:64 -; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:62 +; ALIGNED-NEXT: flat_store_byte v[66:67], v113 offset:72 +; ALIGNED-NEXT: flat_store_byte v[66:67], v39 offset:70 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:76 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v23 offset:72 +; ALIGNED-NEXT: flat_store_byte v[66:67], v23 offset:73 +; ALIGNED-NEXT: flat_store_byte v[66:67], v38 offset:74 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte v[66:67], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte v[66:67], v84 offset:68 +; ALIGNED-NEXT: flat_store_byte v[66:67], v112 offset:66 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v21 offset:64 +; ALIGNED-NEXT: flat_store_byte v[66:67], v21 offset:65 +; ALIGNED-NEXT: flat_store_byte v[66:67], v37 offset:64 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:62 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v20 offset:60 +; ALIGNED-NEXT: flat_store_byte v[64:65], v20 offset:64 ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:56 -; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:54 -; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:60 -; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:58 -; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:52 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:52 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:56 -; ALIGNED-NEXT: flat_store_byte v[86:87], v19 offset:57 -; ALIGNED-NEXT: flat_store_byte v[86:87], v18 offset:53 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:48 -; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:50 -; ALIGNED-NEXT: flat_store_byte v[86:87], v17 offset:49 -; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:44 -; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:46 -; ALIGNED-NEXT: flat_store_byte v[86:87], v16 offset:45 +; ALIGNED-NEXT: flat_store_byte v[66:67], v103 offset:56 +; ALIGNED-NEXT: flat_store_byte v[66:67], v36 offset:54 +; ALIGNED-NEXT: flat_store_byte v[66:67], v54 offset:60 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v19 offset:56 +; ALIGNED-NEXT: flat_store_byte v[66:67], v19 offset:57 +; ALIGNED-NEXT: flat_store_byte v[66:67], v87 offset:58 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v18 offset:52 +; ALIGNED-NEXT: flat_store_byte v[66:67], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte v[66:67], v102 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v17 offset:48 +; ALIGNED-NEXT: flat_store_byte v[66:67], v35 offset:50 +; ALIGNED-NEXT: flat_store_byte v[66:67], v17 offset:49 +; ALIGNED-NEXT: flat_store_byte v[66:67], v55 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v16 offset:44 +; ALIGNED-NEXT: flat_store_byte v[66:67], v34 offset:46 +; ALIGNED-NEXT: flat_store_byte v[66:67], v16 offset:45 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:36 -; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:40 -; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:38 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:40 -; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:44 -; ALIGNED-NEXT: flat_store_byte v[86:87], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:42 -; ALIGNED-NEXT: flat_store_byte v[86:87], v14 offset:37 -; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:36 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:32 -; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:34 -; ALIGNED-NEXT: flat_store_byte v[86:87], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:28 -; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:30 -; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v14 offset:36 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:40 +; ALIGNED-NEXT: flat_store_byte v[66:67], v99 offset:38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte v[66:67], v33 offset:44 +; ALIGNED-NEXT: flat_store_byte v[66:67], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[66:67], v52 offset:42 +; ALIGNED-NEXT: flat_store_byte v[66:67], v14 offset:37 +; ALIGNED-NEXT: flat_store_byte v[66:67], v101 offset:36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte v[66:67], v32 offset:34 +; ALIGNED-NEXT: flat_store_byte v[66:67], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[66:67], v86 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v12 offset:28 +; ALIGNED-NEXT: flat_store_byte v[66:67], v53 offset:30 +; ALIGNED-NEXT: flat_store_byte v[64:65], v12 offset:32 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:20 -; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:24 -; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:22 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:24 -; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:28 -; ALIGNED-NEXT: flat_store_byte v[86:87], v11 offset:25 -; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:26 -; ALIGNED-NEXT: flat_store_byte v[86:87], v10 offset:21 -; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:20 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:16 -; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:18 -; ALIGNED-NEXT: flat_store_byte v[86:87], v9 offset:17 -; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:12 -; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:14 -; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v10 offset:20 +; ALIGNED-NEXT: flat_store_byte v[66:67], v98 offset:24 +; ALIGNED-NEXT: flat_store_byte v[66:67], v31 offset:22 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v11 offset:24 +; ALIGNED-NEXT: flat_store_byte v[66:67], v100 offset:28 +; ALIGNED-NEXT: flat_store_byte v[66:67], v11 offset:25 +; ALIGNED-NEXT: flat_store_byte v[66:67], v30 offset:26 +; ALIGNED-NEXT: flat_store_byte v[66:67], v10 offset:21 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v9 offset:16 +; ALIGNED-NEXT: flat_store_byte v[66:67], v51 offset:18 +; ALIGNED-NEXT: flat_store_byte v[66:67], v9 offset:17 +; ALIGNED-NEXT: flat_store_byte v[66:67], v29 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v8 offset:12 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:14 +; ALIGNED-NEXT: flat_store_byte v[64:65], v8 offset:16 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:4 -; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:8 -; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:6 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:8 -; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:12 -; ALIGNED-NEXT: flat_store_byte v[86:87], v7 offset:9 -; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:10 -; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:4 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 -; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:2 -; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[86:87], v64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:1 -; ALIGNED-NEXT: flat_store_byte v[84:85], v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v6 offset:4 +; ALIGNED-NEXT: flat_store_byte v[66:67], v50 offset:8 +; ALIGNED-NEXT: flat_store_byte v[66:67], v28 offset:6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v7 offset:8 +; ALIGNED-NEXT: flat_store_byte v[66:67], v97 offset:12 +; ALIGNED-NEXT: flat_store_byte v[66:67], v7 offset:9 +; ALIGNED-NEXT: flat_store_byte v[66:67], v115 offset:10 +; ALIGNED-NEXT: flat_store_byte v[64:65], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[66:67], v49 offset:4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v5 +; ALIGNED-NEXT: flat_store_byte v[66:67], v27 offset:2 +; ALIGNED-NEXT: flat_store_byte v[64:65], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[66:67], v85 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[64:65], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[64:65], v26 offset:1 +; ALIGNED-NEXT: flat_store_byte v[64:65], v4 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB2_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) @@ -3689,14 +3691,14 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:168 ; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:164 ; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:160 -; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156 -; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152 -; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148 -; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144 -; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140 -; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136 -; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132 -; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:128 ; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 @@ -3725,10 +3727,10 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(16) ; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:112 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 ; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:64 ; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:48 ; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32 @@ -8640,16 +8642,16 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v48, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v49, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:224 ; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 -; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[4:5], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[4:5], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:144 ; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 ; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112 ; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96 @@ -8659,477 +8661,477 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off -; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 6 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 3 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, v48, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v49, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v50, vcc_lo, v48, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v51, null, 0, v49, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v100 offset:244 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v101 offset:248 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:249 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v99 offset:240 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:241 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v98 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v66 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v66 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v67 offset:248 +; ALIGNED-NEXT: flat_store_byte v[50:51], v66 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[50:51], v67 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[50:51], v54 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v65 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v65 offset:240 +; ALIGNED-NEXT: flat_store_byte v[50:51], v65 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[50:51], v55 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v64 offset:236 +; ALIGNED-NEXT: flat_store_byte v[50:51], v64 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[50:51], v66 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:248 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:246 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:252 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:250 -; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:244 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v117 offset:242 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v118 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 -; ALIGNED-NEXT: flat_store_byte v[96:97], v119 offset:238 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v114 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v115 offset:232 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:229 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v113 offset:224 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:225 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v112 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[50:51], v67 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[50:51], v54 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[50:51], v65 offset:242 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[50:51], v55 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[50:51], v64 offset:238 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v69 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte v[50:51], v66 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[50:51], v67 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[50:51], v54 offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:232 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:230 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:236 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:234 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:228 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:226 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v82 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v71 offset:232 +; ALIGNED-NEXT: flat_store_byte v[50:51], v71 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[50:51], v65 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v83 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v70 offset:228 +; ALIGNED-NEXT: flat_store_byte v[50:51], v70 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[50:51], v55 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[50:51], v64 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v69 offset:224 +; ALIGNED-NEXT: flat_store_byte v[50:51], v69 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[50:51], v66 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[50:51], v67 offset:222 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:222 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v86 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v68 offset:220 +; ALIGNED-NEXT: flat_store_byte v[50:51], v68 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v86 ; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 ; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v82 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v83 offset:216 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:217 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:213 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v81 offset:208 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:209 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v80 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:216 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:214 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:220 +; ALIGNED-NEXT: flat_store_byte v[50:51], v54 offset:216 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[50:51], v71 offset:214 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[50:51], v65 offset:220 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v85 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v83 offset:216 +; ALIGNED-NEXT: flat_store_byte v[50:51], v83 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[50:51], v70 offset:218 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v82 offset:212 +; ALIGNED-NEXT: flat_store_byte v[50:51], v82 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[50:51], v55 offset:212 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:218 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:212 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:210 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:206 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v70 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v71 offset:200 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:201 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:197 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v69 offset:192 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:193 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v68 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v98 +; ALIGNED-NEXT: flat_store_byte v[50:51], v64 offset:210 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v98 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v81 offset:208 +; ALIGNED-NEXT: flat_store_byte v[50:51], v81 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v99 +; ALIGNED-NEXT: flat_store_byte v[50:51], v69 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v99 +; ALIGNED-NEXT: flat_store_byte v[50:51], v66 offset:206 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v80 offset:204 +; ALIGNED-NEXT: flat_store_byte v[50:51], v80 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v97 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: flat_store_byte v[50:51], v67 offset:200 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v96 +; ALIGNED-NEXT: flat_store_byte v[50:51], v68 offset:198 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[50:51], v54 offset:204 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:200 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:198 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:204 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:202 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:196 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:194 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v102 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v87 offset:200 +; ALIGNED-NEXT: flat_store_byte v[50:51], v87 offset:201 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v102 +; ALIGNED-NEXT: flat_store_byte v[50:51], v71 offset:202 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v103 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v86 offset:196 +; ALIGNED-NEXT: flat_store_byte v[50:51], v86 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v103 +; ALIGNED-NEXT: flat_store_byte v[50:51], v65 offset:196 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v100 +; ALIGNED-NEXT: flat_store_byte v[50:51], v83 offset:194 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v85 offset:192 +; ALIGNED-NEXT: flat_store_byte v[50:51], v85 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v101 +; ALIGNED-NEXT: flat_store_byte v[50:51], v70 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v101 +; ALIGNED-NEXT: flat_store_byte v[50:51], v82 offset:190 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:190 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v66 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v67 offset:184 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:185 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:181 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v65 offset:176 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:177 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v64 offset:172 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:184 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:182 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:188 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v114 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v84 offset:188 +; ALIGNED-NEXT: flat_store_byte v[50:51], v84 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v114 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: flat_store_byte v[50:51], v55 offset:184 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v115 +; ALIGNED-NEXT: flat_store_byte v[50:51], v64 offset:182 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v115 +; ALIGNED-NEXT: flat_store_byte v[50:51], v81 offset:188 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v99 offset:184 +; ALIGNED-NEXT: flat_store_byte v[50:51], v99 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v113 +; ALIGNED-NEXT: flat_store_byte v[50:51], v69 offset:186 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v112 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v98 offset:180 +; ALIGNED-NEXT: flat_store_byte v[50:51], v98 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v112 +; ALIGNED-NEXT: flat_store_byte v[50:51], v66 offset:180 ; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:186 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:180 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:178 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 +; ALIGNED-NEXT: flat_store_byte v[50:51], v80 offset:178 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v97 offset:176 +; ALIGNED-NEXT: flat_store_byte v[50:51], v97 offset:177 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[50:51], v67 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[50:51], v68 offset:174 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:174 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v54 offset:164 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:165 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v55 offset:168 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:169 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v52 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:157 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v53 offset:160 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v96 offset:172 +; ALIGNED-NEXT: flat_store_byte v[50:51], v96 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v37 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[50:51], v54 offset:168 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v36 +; ALIGNED-NEXT: flat_store_byte v[50:51], v87 offset:166 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v102 offset:164 +; ALIGNED-NEXT: flat_store_byte v[50:51], v102 offset:165 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:168 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:166 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:172 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:170 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:160 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:158 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:164 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:162 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:152 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v34 +; ALIGNED-NEXT: flat_store_byte v[50:51], v71 offset:172 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[50:51], v86 offset:170 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v35 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v103 offset:168 +; ALIGNED-NEXT: flat_store_byte v[50:51], v103 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[50:51], v65 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[50:51], v83 offset:158 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v33 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v100 offset:156 +; ALIGNED-NEXT: flat_store_byte v[50:51], v100 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[50:51], v85 offset:164 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[50:51], v70 offset:162 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v101 offset:160 +; ALIGNED-NEXT: flat_store_byte v[50:51], v101 offset:161 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: flat_store_byte v[50:51], v82 offset:152 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:150 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v50 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v51 offset:152 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:149 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v49 offset:144 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:145 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v48 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:141 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:156 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v31 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v114 offset:148 +; ALIGNED-NEXT: flat_store_byte v[50:51], v114 offset:149 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[50:51], v84 offset:150 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[50:51], v55 offset:156 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v29 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v115 offset:152 +; ALIGNED-NEXT: flat_store_byte v[50:51], v115 offset:153 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[50:51], v64 offset:154 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[50:51], v81 offset:148 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:154 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:148 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:146 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[50:51], v99 offset:146 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v26 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v113 offset:144 +; ALIGNED-NEXT: flat_store_byte v[50:51], v113 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[50:51], v69 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[50:51], v98 offset:142 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:142 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v112 offset:140 +; ALIGNED-NEXT: flat_store_byte v[50:51], v112 offset:141 ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v38 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v39 offset:136 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:133 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v37 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v36 offset:124 -; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[50:51], v66 offset:136 +; ALIGNED-NEXT: flat_store_byte v[50:51], v80 offset:134 +; ALIGNED-NEXT: flat_store_byte v[50:51], v97 offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:136 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:138 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:132 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:126 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v22 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v39 offset:136 +; ALIGNED-NEXT: flat_store_byte v[50:51], v39 offset:137 +; ALIGNED-NEXT: flat_store_byte v[50:51], v67 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v38 offset:132 +; ALIGNED-NEXT: flat_store_byte v[50:51], v38 offset:133 +; ALIGNED-NEXT: flat_store_byte v[50:51], v68 offset:132 +; ALIGNED-NEXT: flat_store_byte v[50:51], v96 offset:130 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v37 offset:128 +; ALIGNED-NEXT: flat_store_byte v[50:51], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte v[50:51], v54 offset:128 +; ALIGNED-NEXT: flat_store_byte v[50:51], v87 offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v18 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v36 offset:124 +; ALIGNED-NEXT: flat_store_byte v[48:49], v36 offset:128 ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v34 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v35 offset:120 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:117 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v33 offset:112 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:113 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v32 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:109 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:120 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:122 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:116 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:112 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:110 +; ALIGNED-NEXT: flat_store_byte v[50:51], v102 offset:120 +; ALIGNED-NEXT: flat_store_byte v[50:51], v71 offset:118 +; ALIGNED-NEXT: flat_store_byte v[50:51], v86 offset:124 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v35 offset:120 +; ALIGNED-NEXT: flat_store_byte v[50:51], v35 offset:121 +; ALIGNED-NEXT: flat_store_byte v[50:51], v103 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v34 offset:116 +; ALIGNED-NEXT: flat_store_byte v[50:51], v34 offset:117 +; ALIGNED-NEXT: flat_store_byte v[50:51], v65 offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[50:51], v83 offset:114 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v33 offset:112 +; ALIGNED-NEXT: flat_store_byte v[50:51], v33 offset:113 +; ALIGNED-NEXT: flat_store_byte v[50:51], v100 offset:112 +; ALIGNED-NEXT: flat_store_byte v[50:51], v85 offset:110 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v32 offset:108 +; ALIGNED-NEXT: flat_store_byte v[50:51], v32 offset:109 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v30 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v31 offset:104 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:105 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:101 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v29 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:97 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v28 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:93 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:96 +; ALIGNED-NEXT: flat_store_byte v[50:51], v82 offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[50:51], v114 offset:94 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v25 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v14 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v31 offset:104 +; ALIGNED-NEXT: flat_store_byte v[50:51], v31 offset:105 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:94 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v28 offset:92 +; ALIGNED-NEXT: flat_store_byte v[50:51], v28 offset:93 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v6 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:104 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:106 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:100 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[50:51], v70 offset:104 +; ALIGNED-NEXT: flat_store_byte v[50:51], v101 offset:102 +; ALIGNED-NEXT: flat_store_byte v[50:51], v84 offset:106 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v11 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v30 offset:100 +; ALIGNED-NEXT: flat_store_byte v[50:51], v30 offset:101 +; ALIGNED-NEXT: flat_store_byte v[50:51], v55 offset:100 +; ALIGNED-NEXT: flat_store_byte v[50:51], v115 offset:98 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v29 offset:96 +; ALIGNED-NEXT: flat_store_byte v[50:51], v29 offset:97 +; ALIGNED-NEXT: flat_store_byte v[50:51], v64 offset:96 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:88 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; ALIGNED-NEXT: flat_store_byte v[50:51], v81 offset:88 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v17 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:92 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:90 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v4 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v26 offset:84 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v27 offset:88 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:89 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:85 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v25 offset:80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v24 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:77 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[50:51], v99 offset:86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[50:51], v113 offset:92 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v5 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v27 offset:88 +; ALIGNED-NEXT: flat_store_byte v[50:51], v27 offset:89 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[50:51], v69 offset:90 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v26 offset:84 +; ALIGNED-NEXT: flat_store_byte v[50:51], v26 offset:85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[50:51], v98 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v25 offset:80 +; ALIGNED-NEXT: flat_store_byte v[50:51], v112 offset:82 +; ALIGNED-NEXT: flat_store_byte v[50:51], v25 offset:81 +; ALIGNED-NEXT: flat_store_byte v[50:51], v66 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v24 offset:76 +; ALIGNED-NEXT: flat_store_byte v[50:51], v80 offset:78 +; ALIGNED-NEXT: flat_store_byte v[50:51], v24 offset:77 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v22 offset:68 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:72 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:70 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v23 offset:72 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:73 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:74 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v21 offset:64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v20 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:62 -; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte v[50:51], v97 offset:72 +; ALIGNED-NEXT: flat_store_byte v[50:51], v39 offset:70 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v23 offset:72 +; ALIGNED-NEXT: flat_store_byte v[50:51], v67 offset:76 +; ALIGNED-NEXT: flat_store_byte v[50:51], v23 offset:73 +; ALIGNED-NEXT: flat_store_byte v[50:51], v38 offset:74 +; ALIGNED-NEXT: flat_store_byte v[50:51], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte v[50:51], v68 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v21 offset:64 +; ALIGNED-NEXT: flat_store_byte v[50:51], v96 offset:66 +; ALIGNED-NEXT: flat_store_byte v[50:51], v21 offset:65 +; ALIGNED-NEXT: flat_store_byte v[50:51], v37 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v20 offset:60 +; ALIGNED-NEXT: flat_store_byte v[50:51], v54 offset:62 +; ALIGNED-NEXT: flat_store_byte v[48:49], v20 offset:64 ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v18 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:54 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v19 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:57 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:52 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v17 offset:48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v16 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:45 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v18 offset:52 +; ALIGNED-NEXT: flat_store_byte v[50:51], v87 offset:56 +; ALIGNED-NEXT: flat_store_byte v[50:51], v36 offset:54 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v19 offset:56 +; ALIGNED-NEXT: flat_store_byte v[50:51], v102 offset:60 +; ALIGNED-NEXT: flat_store_byte v[50:51], v19 offset:57 +; ALIGNED-NEXT: flat_store_byte v[50:51], v71 offset:58 +; ALIGNED-NEXT: flat_store_byte v[50:51], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte v[50:51], v86 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v17 offset:48 +; ALIGNED-NEXT: flat_store_byte v[50:51], v35 offset:50 +; ALIGNED-NEXT: flat_store_byte v[50:51], v17 offset:49 +; ALIGNED-NEXT: flat_store_byte v[50:51], v103 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v16 offset:44 +; ALIGNED-NEXT: flat_store_byte v[50:51], v34 offset:46 +; ALIGNED-NEXT: flat_store_byte v[50:51], v16 offset:45 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v14 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:38 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v15 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:36 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v13 offset:32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v12 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:30 -; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v14 offset:36 +; ALIGNED-NEXT: flat_store_byte v[50:51], v65 offset:40 +; ALIGNED-NEXT: flat_store_byte v[50:51], v83 offset:38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte v[50:51], v33 offset:44 +; ALIGNED-NEXT: flat_store_byte v[50:51], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[50:51], v100 offset:42 +; ALIGNED-NEXT: flat_store_byte v[50:51], v14 offset:37 +; ALIGNED-NEXT: flat_store_byte v[50:51], v85 offset:36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte v[50:51], v32 offset:34 +; ALIGNED-NEXT: flat_store_byte v[50:51], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[50:51], v70 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v12 offset:28 +; ALIGNED-NEXT: flat_store_byte v[50:51], v101 offset:30 +; ALIGNED-NEXT: flat_store_byte v[48:49], v12 offset:32 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v10 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:22 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v11 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:25 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:20 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v9 offset:16 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v8 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:14 -; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v10 offset:20 +; ALIGNED-NEXT: flat_store_byte v[50:51], v82 offset:24 +; ALIGNED-NEXT: flat_store_byte v[50:51], v31 offset:22 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v11 offset:24 +; ALIGNED-NEXT: flat_store_byte v[50:51], v84 offset:28 +; ALIGNED-NEXT: flat_store_byte v[50:51], v11 offset:25 +; ALIGNED-NEXT: flat_store_byte v[50:51], v30 offset:26 +; ALIGNED-NEXT: flat_store_byte v[50:51], v10 offset:21 +; ALIGNED-NEXT: flat_store_byte v[50:51], v55 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v9 offset:16 +; ALIGNED-NEXT: flat_store_byte v[50:51], v115 offset:18 +; ALIGNED-NEXT: flat_store_byte v[50:51], v9 offset:17 +; ALIGNED-NEXT: flat_store_byte v[50:51], v29 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v8 offset:12 +; ALIGNED-NEXT: flat_store_byte v[50:51], v64 offset:14 +; ALIGNED-NEXT: flat_store_byte v[48:49], v8 offset:16 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v6 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:6 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v7 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:10 -; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:4 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:2 -; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:1 -; ALIGNED-NEXT: flat_store_byte v[84:85], v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v6 offset:4 +; ALIGNED-NEXT: flat_store_byte v[50:51], v114 offset:8 +; ALIGNED-NEXT: flat_store_byte v[50:51], v28 offset:6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v7 offset:8 +; ALIGNED-NEXT: flat_store_byte v[50:51], v81 offset:12 +; ALIGNED-NEXT: flat_store_byte v[50:51], v7 offset:9 +; ALIGNED-NEXT: flat_store_byte v[50:51], v99 offset:10 +; ALIGNED-NEXT: flat_store_byte v[48:49], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[50:51], v113 offset:4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[52:53], v5 +; ALIGNED-NEXT: flat_store_byte v[50:51], v27 offset:2 +; ALIGNED-NEXT: flat_store_byte v[48:49], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[50:51], v69 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[48:49], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[48:49], v26 offset:1 +; ALIGNED-NEXT: flat_store_byte v[48:49], v4 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB7_2 ; ALIGNED-NEXT: .LBB7_3: ; %Flow6 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 @@ -9142,14 +9144,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 -; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[4:5], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[4:5], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:176 ; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 ; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 ; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 @@ -9161,476 +9163,477 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 6 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 3 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v66, vcc_lo, v64, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v67, null, 0, v65, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v68, vcc_lo, v64, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v69, null, 0, v65, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:248 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:249 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:245 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:240 -; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:241 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:236 -; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:248 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:246 -; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:252 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v82 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v82 offset:244 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:245 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v83 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v83 offset:248 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:249 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v81 offset:240 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:241 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v80 offset:236 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:250 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 -; ALIGNED-NEXT: flat_store_byte v[86:87], v116 offset:244 -; ALIGNED-NEXT: flat_store_byte v[86:87], v117 offset:242 -; ALIGNED-NEXT: flat_store_byte v[86:87], v118 offset:240 -; ALIGNED-NEXT: flat_store_byte v[86:87], v119 offset:238 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:232 -; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:233 -; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:229 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:224 -; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:225 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:220 -; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v86 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v86 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:242 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v85 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:238 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v85 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v84 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:236 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:232 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83 -; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:230 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:236 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:234 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:228 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:226 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v98 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v87 offset:232 +; ALIGNED-NEXT: flat_store_byte v[66:67], v87 offset:233 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v99 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v86 offset:228 +; ALIGNED-NEXT: flat_store_byte v[66:67], v86 offset:229 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v99 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v97 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v85 offset:224 +; ALIGNED-NEXT: flat_store_byte v[66:67], v85 offset:225 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v96 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:222 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:222 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:396 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:216 -; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:217 -; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:213 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:208 -; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:209 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:204 -; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:216 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68 -; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:214 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:220 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v102 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v84 offset:220 +; ALIGNED-NEXT: flat_store_byte v[66:67], v84 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v102 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:216 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v103 +; ALIGNED-NEXT: flat_store_byte v[66:67], v87 offset:214 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v103 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:220 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v101 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v99 offset:216 +; ALIGNED-NEXT: flat_store_byte v[66:67], v99 offset:217 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v101 +; ALIGNED-NEXT: flat_store_byte v[66:67], v86 offset:218 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v98 offset:212 +; ALIGNED-NEXT: flat_store_byte v[66:67], v98 offset:213 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v100 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:212 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66 -; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:218 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:212 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:210 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:206 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:200 -; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:201 -; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:197 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:192 -; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:193 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:188 -; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v114 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:210 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v114 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v97 offset:208 +; ALIGNED-NEXT: flat_store_byte v[66:67], v97 offset:209 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v115 +; ALIGNED-NEXT: flat_store_byte v[66:67], v85 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v115 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:206 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v96 offset:204 +; ALIGNED-NEXT: flat_store_byte v[66:67], v96 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v113 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:200 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v112 +; ALIGNED-NEXT: flat_store_byte v[66:67], v84 offset:198 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v112 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v103 offset:200 +; ALIGNED-NEXT: flat_store_byte v[66:67], v103 offset:201 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:204 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:200 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:198 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:204 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52 -; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:202 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:196 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:194 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:192 +; ALIGNED-NEXT: flat_store_byte v[66:67], v87 offset:202 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v55 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v102 offset:196 +; ALIGNED-NEXT: flat_store_byte v[66:67], v102 offset:197 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:196 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[66:67], v99 offset:194 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v101 offset:192 +; ALIGNED-NEXT: flat_store_byte v[66:67], v101 offset:193 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[66:67], v86 offset:192 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[66:67], v98 offset:190 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:190 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:184 -; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:185 -; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:181 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:176 -; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:177 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:172 -; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:184 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 -; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:182 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:188 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:186 -; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:180 -; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:178 -; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:176 -; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:174 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v50 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v100 offset:188 +; ALIGNED-NEXT: flat_store_byte v[66:67], v100 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v50 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:184 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v51 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:182 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[66:67], v97 offset:188 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v49 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v115 offset:184 +; ALIGNED-NEXT: flat_store_byte v[66:67], v115 offset:185 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[66:67], v85 offset:186 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v114 offset:180 +; ALIGNED-NEXT: flat_store_byte v[66:67], v114 offset:181 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:180 +; ALIGNED-NEXT: flat_store_byte v[66:67], v96 offset:178 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v113 offset:176 +; ALIGNED-NEXT: flat_store_byte v[66:67], v113 offset:177 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:176 +; ALIGNED-NEXT: flat_store_byte v[66:67], v84 offset:174 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v112 offset:172 +; ALIGNED-NEXT: flat_store_byte v[66:67], v112 offset:173 ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:504 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:164 -; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:165 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:168 -; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:169 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:156 -; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:157 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:160 -; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:161 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[66:67], v103 offset:166 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:168 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v54 offset:164 +; ALIGNED-NEXT: flat_store_byte v[66:67], v54 offset:165 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:168 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35 -; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:166 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:172 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 -; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:170 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:160 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:158 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:164 -; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:162 +; ALIGNED-NEXT: flat_store_byte v[66:67], v87 offset:172 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[66:67], v102 offset:170 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v55 offset:168 +; ALIGNED-NEXT: flat_store_byte v[66:67], v55 offset:169 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[66:67], v99 offset:158 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v33 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v52 offset:156 +; ALIGNED-NEXT: flat_store_byte v[66:67], v52 offset:157 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[66:67], v101 offset:164 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[66:67], v86 offset:162 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v53 offset:160 +; ALIGNED-NEXT: flat_store_byte v[66:67], v53 offset:161 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:456 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:460 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:152 +; ALIGNED-NEXT: flat_store_byte v[66:67], v98 offset:152 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28 -; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:150 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:152 -; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:153 -; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:149 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:144 -; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:145 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:140 -; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:141 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v31 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v50 offset:148 +; ALIGNED-NEXT: flat_store_byte v[66:67], v50 offset:149 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[66:67], v100 offset:150 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:156 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v51 offset:152 +; ALIGNED-NEXT: flat_store_byte v[66:67], v51 offset:153 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:156 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:154 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[66:67], v97 offset:148 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:154 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:148 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27 -; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:146 -; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:144 -; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:142 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[66:67], v115 offset:146 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v26 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v49 offset:144 +; ALIGNED-NEXT: flat_store_byte v[66:67], v49 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[66:67], v85 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[66:67], v114 offset:142 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v48 offset:140 +; ALIGNED-NEXT: flat_store_byte v[66:67], v48 offset:141 ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:472 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:136 -; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:137 -; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:133 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:128 -; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:124 -; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:136 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:134 -; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:140 -; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:138 -; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:132 -; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:130 -; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:126 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18 +; ALIGNED-NEXT: flat_store_byte v[66:67], v113 offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[66:67], v103 offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v25 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:136 +; ALIGNED-NEXT: flat_store_byte v[66:67], v96 offset:134 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v39 offset:136 +; ALIGNED-NEXT: flat_store_byte v[66:67], v39 offset:137 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:138 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v38 offset:132 +; ALIGNED-NEXT: flat_store_byte v[66:67], v38 offset:133 +; ALIGNED-NEXT: flat_store_byte v[66:67], v84 offset:132 +; ALIGNED-NEXT: flat_store_byte v[66:67], v112 offset:130 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v37 offset:128 +; ALIGNED-NEXT: flat_store_byte v[66:67], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:128 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v36 offset:124 +; ALIGNED-NEXT: flat_store_byte v[64:65], v36 offset:128 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v18 ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:120 -; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:121 -; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:117 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:112 -; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:113 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:108 -; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:109 -; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:120 -; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:118 -; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:122 -; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:116 -; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:114 -; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:112 -; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:110 +; ALIGNED-NEXT: flat_store_byte v[66:67], v54 offset:120 +; ALIGNED-NEXT: flat_store_byte v[66:67], v87 offset:118 +; ALIGNED-NEXT: flat_store_byte v[66:67], v102 offset:124 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v35 offset:120 +; ALIGNED-NEXT: flat_store_byte v[66:67], v35 offset:121 +; ALIGNED-NEXT: flat_store_byte v[66:67], v55 offset:122 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v34 offset:116 +; ALIGNED-NEXT: flat_store_byte v[66:67], v34 offset:117 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:116 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[66:67], v99 offset:114 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v33 offset:112 +; ALIGNED-NEXT: flat_store_byte v[66:67], v33 offset:113 +; ALIGNED-NEXT: flat_store_byte v[66:67], v52 offset:112 +; ALIGNED-NEXT: flat_store_byte v[66:67], v101 offset:110 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v32 offset:108 +; ALIGNED-NEXT: flat_store_byte v[66:67], v32 offset:109 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:104 -; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:105 -; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:101 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:96 -; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:97 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:92 -; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:93 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:96 +; ALIGNED-NEXT: flat_store_byte v[66:67], v98 offset:108 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[66:67], v50 offset:94 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v14 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v31 offset:104 +; ALIGNED-NEXT: flat_store_byte v[66:67], v31 offset:105 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:94 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v28 offset:92 +; ALIGNED-NEXT: flat_store_byte v[66:67], v28 offset:93 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v6 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15 -; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:104 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:102 -; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:108 -; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:106 -; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:100 -; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[66:67], v86 offset:104 +; ALIGNED-NEXT: flat_store_byte v[66:67], v53 offset:102 +; ALIGNED-NEXT: flat_store_byte v[66:67], v100 offset:106 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v11 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v30 offset:100 +; ALIGNED-NEXT: flat_store_byte v[66:67], v30 offset:101 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:100 +; ALIGNED-NEXT: flat_store_byte v[66:67], v51 offset:98 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v29 offset:96 +; ALIGNED-NEXT: flat_store_byte v[66:67], v29 offset:97 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:96 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:260 ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:84 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:88 -; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:89 -; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:85 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:80 -; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:81 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:76 -; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:77 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20 +; ALIGNED-NEXT: flat_store_byte v[66:67], v97 offset:88 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v17 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[66:67], v115 offset:86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v7 +; ALIGNED-NEXT: flat_store_byte v[66:67], v49 offset:92 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v5 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v27 offset:88 +; ALIGNED-NEXT: flat_store_byte v[66:67], v27 offset:89 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v5 -; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:88 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v4 -; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:92 -; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:90 -; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:84 -; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:82 -; ALIGNED-NEXT: flat_store_byte v[86:87], v36 offset:80 -; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:78 +; ALIGNED-NEXT: flat_store_byte v[66:67], v85 offset:90 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v26 offset:84 +; ALIGNED-NEXT: flat_store_byte v[66:67], v26 offset:85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[66:67], v114 offset:84 +; ALIGNED-NEXT: flat_store_byte v[66:67], v48 offset:82 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v25 offset:80 +; ALIGNED-NEXT: flat_store_byte v[66:67], v25 offset:81 +; ALIGNED-NEXT: flat_store_byte v[66:67], v82 offset:80 +; ALIGNED-NEXT: flat_store_byte v[66:67], v96 offset:78 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v24 offset:76 +; ALIGNED-NEXT: flat_store_byte v[66:67], v24 offset:77 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:72 -; ALIGNED-NEXT: flat_store_byte v[86:87], v23 offset:73 -; ALIGNED-NEXT: flat_store_byte v[86:87], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:64 -; ALIGNED-NEXT: flat_store_byte v[86:87], v21 offset:65 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:60 -; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 -; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:72 -; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:70 -; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:76 -; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:74 -; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:68 -; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:66 -; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:64 -; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:62 +; ALIGNED-NEXT: flat_store_byte v[66:67], v113 offset:72 +; ALIGNED-NEXT: flat_store_byte v[66:67], v39 offset:70 +; ALIGNED-NEXT: flat_store_byte v[66:67], v83 offset:76 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v23 offset:72 +; ALIGNED-NEXT: flat_store_byte v[66:67], v23 offset:73 +; ALIGNED-NEXT: flat_store_byte v[66:67], v38 offset:74 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte v[66:67], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte v[66:67], v84 offset:68 +; ALIGNED-NEXT: flat_store_byte v[66:67], v112 offset:66 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v21 offset:64 +; ALIGNED-NEXT: flat_store_byte v[66:67], v21 offset:65 +; ALIGNED-NEXT: flat_store_byte v[66:67], v37 offset:64 +; ALIGNED-NEXT: flat_store_byte v[66:67], v70 offset:62 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v20 offset:60 +; ALIGNED-NEXT: flat_store_byte v[64:65], v20 offset:64 ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:364 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:56 -; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:54 -; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:60 -; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:58 -; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:52 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:52 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:56 -; ALIGNED-NEXT: flat_store_byte v[86:87], v19 offset:57 -; ALIGNED-NEXT: flat_store_byte v[86:87], v18 offset:53 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:48 -; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:50 -; ALIGNED-NEXT: flat_store_byte v[86:87], v17 offset:49 -; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:44 -; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:46 -; ALIGNED-NEXT: flat_store_byte v[86:87], v16 offset:45 +; ALIGNED-NEXT: flat_store_byte v[66:67], v103 offset:56 +; ALIGNED-NEXT: flat_store_byte v[66:67], v36 offset:54 +; ALIGNED-NEXT: flat_store_byte v[66:67], v54 offset:60 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v19 offset:56 +; ALIGNED-NEXT: flat_store_byte v[66:67], v19 offset:57 +; ALIGNED-NEXT: flat_store_byte v[66:67], v87 offset:58 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v18 offset:52 +; ALIGNED-NEXT: flat_store_byte v[66:67], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte v[66:67], v102 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v17 offset:48 +; ALIGNED-NEXT: flat_store_byte v[66:67], v35 offset:50 +; ALIGNED-NEXT: flat_store_byte v[66:67], v17 offset:49 +; ALIGNED-NEXT: flat_store_byte v[66:67], v55 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v16 offset:44 +; ALIGNED-NEXT: flat_store_byte v[66:67], v34 offset:46 +; ALIGNED-NEXT: flat_store_byte v[66:67], v16 offset:45 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:36 -; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:40 -; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:38 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:40 -; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:44 -; ALIGNED-NEXT: flat_store_byte v[86:87], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:42 -; ALIGNED-NEXT: flat_store_byte v[86:87], v14 offset:37 -; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:36 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:32 -; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:34 -; ALIGNED-NEXT: flat_store_byte v[86:87], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:28 -; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:30 -; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v14 offset:36 +; ALIGNED-NEXT: flat_store_byte v[66:67], v81 offset:40 +; ALIGNED-NEXT: flat_store_byte v[66:67], v99 offset:38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte v[66:67], v33 offset:44 +; ALIGNED-NEXT: flat_store_byte v[66:67], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[66:67], v52 offset:42 +; ALIGNED-NEXT: flat_store_byte v[66:67], v14 offset:37 +; ALIGNED-NEXT: flat_store_byte v[66:67], v101 offset:36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte v[66:67], v32 offset:34 +; ALIGNED-NEXT: flat_store_byte v[66:67], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[66:67], v86 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v12 offset:28 +; ALIGNED-NEXT: flat_store_byte v[66:67], v53 offset:30 +; ALIGNED-NEXT: flat_store_byte v[64:65], v12 offset:32 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:20 -; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:24 -; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:22 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:24 -; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:28 -; ALIGNED-NEXT: flat_store_byte v[86:87], v11 offset:25 -; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:26 -; ALIGNED-NEXT: flat_store_byte v[86:87], v10 offset:21 -; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:20 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:16 -; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:18 -; ALIGNED-NEXT: flat_store_byte v[86:87], v9 offset:17 -; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:12 -; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:14 -; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v10 offset:20 +; ALIGNED-NEXT: flat_store_byte v[66:67], v98 offset:24 +; ALIGNED-NEXT: flat_store_byte v[66:67], v31 offset:22 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v11 offset:24 +; ALIGNED-NEXT: flat_store_byte v[66:67], v100 offset:28 +; ALIGNED-NEXT: flat_store_byte v[66:67], v11 offset:25 +; ALIGNED-NEXT: flat_store_byte v[66:67], v30 offset:26 +; ALIGNED-NEXT: flat_store_byte v[66:67], v10 offset:21 +; ALIGNED-NEXT: flat_store_byte v[66:67], v71 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v9 offset:16 +; ALIGNED-NEXT: flat_store_byte v[66:67], v51 offset:18 +; ALIGNED-NEXT: flat_store_byte v[66:67], v9 offset:17 +; ALIGNED-NEXT: flat_store_byte v[66:67], v29 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v8 offset:12 +; ALIGNED-NEXT: flat_store_byte v[66:67], v80 offset:14 +; ALIGNED-NEXT: flat_store_byte v[64:65], v8 offset:16 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:4 -; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:8 -; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:6 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:8 -; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:12 -; ALIGNED-NEXT: flat_store_byte v[86:87], v7 offset:9 -; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:10 -; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:4 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 -; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:2 -; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[86:87], v64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:1 -; ALIGNED-NEXT: flat_store_byte v[84:85], v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v6 offset:4 +; ALIGNED-NEXT: flat_store_byte v[66:67], v50 offset:8 +; ALIGNED-NEXT: flat_store_byte v[66:67], v28 offset:6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v7 offset:8 +; ALIGNED-NEXT: flat_store_byte v[66:67], v97 offset:12 +; ALIGNED-NEXT: flat_store_byte v[66:67], v7 offset:9 +; ALIGNED-NEXT: flat_store_byte v[66:67], v115 offset:10 +; ALIGNED-NEXT: flat_store_byte v[64:65], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[66:67], v49 offset:4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[68:69], v5 +; ALIGNED-NEXT: flat_store_byte v[66:67], v27 offset:2 +; ALIGNED-NEXT: flat_store_byte v[64:65], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[66:67], v85 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[64:65], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[64:65], v26 offset:1 +; ALIGNED-NEXT: flat_store_byte v[64:65], v4 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB7_5 ; ALIGNED-NEXT: .LBB7_6: ; %Flow7 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 @@ -12696,14 +12699,14 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 ; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 ; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 -; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156 -; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152 -; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148 -; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144 -; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140 -; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136 -; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132 -; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128 +; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:140 +; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:156 +; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:152 +; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:148 +; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:144 +; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:136 +; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:132 +; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:128 ; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8 @@ -12731,10 +12734,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(16) ; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:112 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(11) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 ; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64 ; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48 ; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll index 9585c486aeb9e..bdc92a11057d4 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll @@ -375,19 +375,18 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr ; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a5 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a4 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 ; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 ; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:16 ; GFX908-NEXT: s_endpgm bb: %acc = call i32 asm sideeffect "; def $0", "={a0}"() @@ -481,19 +480,18 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add ; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a5 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a4 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 ; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 ; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:16 ; GFX908-NEXT: s_endpgm bb: call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> poison) @@ -587,19 +585,18 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr ; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a5 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a4 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 ; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 ; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:16 ; GFX908-NEXT: s_endpgm bb: %acc = call i32 asm sideeffect "; def $0", "={v0}"() @@ -715,19 +712,18 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) ; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a5 ; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a4 ; GFX908-NEXT: global_store_dwordx4 v40, v[4:7], s[34:35] offset:112 ; GFX908-NEXT: global_store_dwordx4 v40, v[8:11], s[34:35] offset:64 ; GFX908-NEXT: global_store_dwordx4 v40, v[12:15], s[34:35] offset:80 ; GFX908-NEXT: global_store_dwordx4 v40, v[16:19], s[34:35] offset:32 ; GFX908-NEXT: global_store_dwordx4 v40, v[20:23], s[34:35] offset:48 ; GFX908-NEXT: global_store_dwordx4 v40, v[24:27], s[34:35] -; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:16 +; GFX908-NEXT: global_store_dwordx4 v40, v[28:31], s[34:35] offset:16 ; GFX908-NEXT: s_endpgm bb: call void @foo() @@ -1000,19 +996,18 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v33, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a6 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a4 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:16 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] bb: @@ -1101,19 +1096,18 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v33, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a6 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 -; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 -; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 -; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 -; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v31, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v30, a4 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48 ; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off -; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[30:33], off offset:16 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 4681d589ac217..038201f819bfc 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -3382,21 +3382,21 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_lo_u32 v3, v4, v3 ; VI-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v4, v2, 0 -; VI-NEXT: v_mul_lo_u32 v2, v5, v2 -; VI-NEXT: v_mul_lo_u32 v10, v7, v0 +; VI-NEXT: v_mul_lo_u32 v15, v7, v0 ; VI-NEXT: v_mad_u64_u32 v[7:8], s[0:1], v0, v4, 0 -; VI-NEXT: v_add_u32_e32 v3, vcc, v14, v3 -; VI-NEXT: v_add_u32_e32 v14, vcc, v3, v2 +; VI-NEXT: v_mul_lo_u32 v10, v5, v2 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v3 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[8:9] +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v10 ; VI-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v6, v0, v[13:14] ; VI-NEXT: v_mov_b32_e32 v8, v2 ; VI-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, v[8:9] -; VI-NEXT: v_mul_lo_u32 v4, v6, v1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v10, v14 +; VI-NEXT: v_add_u32_e32 v4, vcc, v15, v14 ; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v9 +; VI-NEXT: v_mul_lo_u32 v16, v6, v1 ; VI-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[2:3] -; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v6 +; VI-NEXT: v_add_u32_e32 v2, vcc, v16, v4 ; VI-NEXT: v_add_u32_e32 v9, vcc, v0, v13 ; VI-NEXT: v_addc_u32_e32 v10, vcc, v1, v2, vcc ; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10] @@ -3422,10 +3422,10 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX9-NEXT: v_mul_lo_u32 v4, v6, v1 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v10 -; GFX9-NEXT: v_mul_lo_u32 v14, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v16, v7, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] -; GFX9-NEXT: v_add3_u32 v3, v14, v3, v4 +; GFX9-NEXT: v_add3_u32 v3, v16, v3, v4 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc ; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 33cd598aae9b5..1a31eec2568a6 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -73,22 +73,22 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9-NEXT: .LBB1_2: ; %bb23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 +; GFX9-NEXT: v_add_u32_e32 v18, v9, v0 ; GFX9-NEXT: v_add_u32_e32 v12, v17, v0 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5 -; GFX9-NEXT: v_add_u32_e32 v19, v3, v16 -; GFX9-NEXT: v_add_u32_e32 v3, v9, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v18 -; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v13 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v19, v15, v[3:4] +; GFX9-NEXT: v_mul_u32_u24_e32 v19, v3, v5 +; GFX9-NEXT: v_add_u32_e32 v20, v3, v16 +; GFX9-NEXT: v_sub_u32_e32 v3, v18, v19 +; GFX9-NEXT: v_sub_u32_e32 v12, v12, v19 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v20, v15, v[3:4] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v20, v13 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v18, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4] -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18 ; GFX9-NEXT: v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7] ; GFX9-NEXT: global_load_dword v3, v[18:19], off diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 6b45d31da0e95..db4e7d2025b29 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -184,64 +184,66 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900-LABEL: fadd_v32_vs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 7, v0 ; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 -; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] -; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 -; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 -; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 -; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_f32_e32 v4, s43, v4 -; GFX900-NEXT: v_add_f32_e32 v3, s42, v3 -; GFX900-NEXT: v_add_f32_e32 v2, s41, v2 -; GFX900-NEXT: v_add_f32_e32 v1, s40, v1 -; GFX900-NEXT: v_add_f32_e32 v32, s19, v32 -; GFX900-NEXT: v_add_f32_e32 v31, s18, v31 -; GFX900-NEXT: v_add_f32_e32 v30, s17, v30 -; GFX900-NEXT: v_add_f32_e32 v29, s16, v29 -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 -; GFX900-NEXT: v_add_f32_e32 v5, s36, v5 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:96 +; GFX900-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 +; GFX900-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:80 ; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_f32_e32 v12, s51, v12 -; GFX900-NEXT: v_add_f32_e32 v11, s50, v11 -; GFX900-NEXT: v_add_f32_e32 v10, s49, v10 -; GFX900-NEXT: v_add_f32_e32 v9, s48, v9 -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_f32_e32 v16, s47, v16 -; GFX900-NEXT: v_add_f32_e32 v15, s46, v15 -; GFX900-NEXT: v_add_f32_e32 v14, s45, v14 -; GFX900-NEXT: v_add_f32_e32 v13, s44, v13 +; GFX900-NEXT: v_add_f32_e32 v3, s23, v3 +; GFX900-NEXT: v_add_f32_e32 v2, s22, v2 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_f32_e32 v20, s15, v20 -; GFX900-NEXT: v_add_f32_e32 v19, s14, v19 -; GFX900-NEXT: v_add_f32_e32 v18, s13, v18 -; GFX900-NEXT: v_add_f32_e32 v17, s12, v17 +; GFX900-NEXT: v_add_f32_e32 v11, s11, v11 +; GFX900-NEXT: v_add_f32_e32 v10, s10, v10 +; GFX900-NEXT: v_add_f32_e32 v9, s9, v9 +; GFX900-NEXT: v_add_f32_e32 v8, s8, v8 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v15, s15, v15 +; GFX900-NEXT: v_add_f32_e32 v14, s14, v14 +; GFX900-NEXT: v_add_f32_e32 v13, s13, v13 +; GFX900-NEXT: v_add_f32_e32 v12, s12, v12 +; GFX900-NEXT: v_add_f32_e32 v1, s21, v1 +; GFX900-NEXT: v_add_f32_e32 v0, s20, v0 +; GFX900-NEXT: v_add_f32_e32 v7, s19, v7 +; GFX900-NEXT: v_add_f32_e32 v6, s18, v6 +; GFX900-NEXT: v_add_f32_e32 v5, s17, v5 +; GFX900-NEXT: v_add_f32_e32 v4, s16, v4 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:64 +; GFX900-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:48 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:96 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:112 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:80 +; GFX900-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GFX900-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xa4 +; GFX900-NEXT: s_waitcnt vmcnt(6) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v11, s23, v11 +; GFX900-NEXT: v_add_f32_e32 v10, s22, v10 +; GFX900-NEXT: v_add_f32_e32 v9, s21, v9 +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_add_f32_e32 v15, s19, v15 +; GFX900-NEXT: v_add_f32_e32 v14, s18, v14 +; GFX900-NEXT: v_add_f32_e32 v13, s17, v13 +; GFX900-NEXT: v_add_f32_e32 v12, s16, v12 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_f32_e32 v24, s11, v24 -; GFX900-NEXT: v_add_f32_e32 v23, s10, v23 -; GFX900-NEXT: v_add_f32_e32 v22, s9, v22 -; GFX900-NEXT: v_add_f32_e32 v21, s8, v21 -; GFX900-NEXT: v_add_f32_e32 v28, s23, v28 -; GFX900-NEXT: v_add_f32_e32 v27, s22, v27 -; GFX900-NEXT: v_add_f32_e32 v26, s21, v26 -; GFX900-NEXT: v_add_f32_e32 v25, s20, v25 -; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96 -; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112 -; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64 -; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80 -; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32 -; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48 -; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] -; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16 +; GFX900-NEXT: v_add_f32_e32 v7, s15, v7 +; GFX900-NEXT: v_add_f32_e32 v6, s14, v6 +; GFX900-NEXT: v_add_f32_e32 v5, s13, v5 +; GFX900-NEXT: v_add_f32_e32 v4, s12, v4 +; GFX900-NEXT: v_add_f32_e32 v3, s11, v3 +; GFX900-NEXT: v_add_f32_e32 v2, s10, v2 +; GFX900-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX900-NEXT: v_add_f32_e32 v0, s8, v0 +; GFX900-NEXT: v_add_f32_e32 v8, s20, v8 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:32 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:48 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GFX900-NEXT: s_endpgm ; ; PACKED-SDAG-LABEL: fadd_v32_vs: @@ -350,61 +352,62 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40 +; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v60, 7, v0 :: v_dual_mov_b32 v32, s40 ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1] -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v60, s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v60, s[0:1] +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v60, s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v60, s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v60, s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v60, s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v60, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v60, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v56, s16 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v57, s17 :: v_dual_mov_b32 v58, s18 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v59, s19 :: v_dual_mov_b32 v51, s11 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v41, s51 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v43, s45 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v44, s46 :: v_dual_mov_b32 v37, s39 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v39, s49 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v40, s50 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s16 :: v_dual_mov_b32 v33, s17 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[40:41] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[38:39] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[8:9] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[46:47] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[32:33] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[34:35] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[50:51] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1 +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[52:53] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[52:53] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[54:55] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[50:51] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[40:41] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[46:47] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[56:57] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[58:59] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[54:55] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[34:35] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[48:49] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[42:43] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[44:45] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[36:37] -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[42:43] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[44:45] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[36:37] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[40:41] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[32:33] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1] -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[28:31], s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[24:27], s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[20:23], s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[16:19], s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[12:15], s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[8:11], s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[4:7], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[0:3], s[0:1] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fadd_v32_vs: @@ -415,17 +418,17 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 ; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v58, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1] -; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16 -; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 -; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v58, s[0:1] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v58, s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v58, s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v58, s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v58, s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v58, s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v58, s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v58, s[0:1] offset:112 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -438,16 +441,16 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[56:57], s[16:17] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[20:21] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[36:37] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[22:23] ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[38:39] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[40:41] ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[42:43] @@ -461,20 +464,20 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[52:53] ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[54:55] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[32:33] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[34:35] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[56:57] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[32:33] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[36:37] -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[34:35] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[36:37] ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1] -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[0:3], s[0:1] +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[4:7], s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[8:11], s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[12:15], s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[16:19], s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[20:23], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[24:27], s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[28:31], s[0:1] offset:112 ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id @@ -1443,64 +1446,66 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900-LABEL: fmul_v32_vs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 7, v0 ; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 -; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] -; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 -; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 -; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 -; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4 -; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3 -; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2 -; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1 -; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32 -; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31 -; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30 -; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29 -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 -; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:96 +; GFX900-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 +; GFX900-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:80 ; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12 -; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11 -; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10 -; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9 -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16 -; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15 -; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14 -; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13 +; GFX900-NEXT: v_mul_f32_e32 v3, s23, v3 +; GFX900-NEXT: v_mul_f32_e32 v2, s22, v2 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20 -; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19 -; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18 -; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17 +; GFX900-NEXT: v_mul_f32_e32 v11, s11, v11 +; GFX900-NEXT: v_mul_f32_e32 v10, s10, v10 +; GFX900-NEXT: v_mul_f32_e32 v9, s9, v9 +; GFX900-NEXT: v_mul_f32_e32 v8, s8, v8 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v15, s15, v15 +; GFX900-NEXT: v_mul_f32_e32 v14, s14, v14 +; GFX900-NEXT: v_mul_f32_e32 v13, s13, v13 +; GFX900-NEXT: v_mul_f32_e32 v12, s12, v12 +; GFX900-NEXT: v_mul_f32_e32 v1, s21, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, s20, v0 +; GFX900-NEXT: v_mul_f32_e32 v7, s19, v7 +; GFX900-NEXT: v_mul_f32_e32 v6, s18, v6 +; GFX900-NEXT: v_mul_f32_e32 v5, s17, v5 +; GFX900-NEXT: v_mul_f32_e32 v4, s16, v4 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:64 +; GFX900-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:48 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:96 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:112 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:80 +; GFX900-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GFX900-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xa4 +; GFX900-NEXT: s_waitcnt vmcnt(6) lgkmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v11, s23, v11 +; GFX900-NEXT: v_mul_f32_e32 v10, s22, v10 +; GFX900-NEXT: v_mul_f32_e32 v9, s21, v9 +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_mul_f32_e32 v15, s19, v15 +; GFX900-NEXT: v_mul_f32_e32 v14, s18, v14 +; GFX900-NEXT: v_mul_f32_e32 v13, s17, v13 +; GFX900-NEXT: v_mul_f32_e32 v12, s16, v12 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24 -; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23 -; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22 -; GFX900-NEXT: v_mul_f32_e32 v21, s8, v21 -; GFX900-NEXT: v_mul_f32_e32 v28, s23, v28 -; GFX900-NEXT: v_mul_f32_e32 v27, s22, v27 -; GFX900-NEXT: v_mul_f32_e32 v26, s21, v26 -; GFX900-NEXT: v_mul_f32_e32 v25, s20, v25 -; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96 -; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112 -; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64 -; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80 -; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32 -; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48 -; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] -; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16 +; GFX900-NEXT: v_mul_f32_e32 v7, s15, v7 +; GFX900-NEXT: v_mul_f32_e32 v6, s14, v6 +; GFX900-NEXT: v_mul_f32_e32 v5, s13, v5 +; GFX900-NEXT: v_mul_f32_e32 v4, s12, v4 +; GFX900-NEXT: v_mul_f32_e32 v3, s11, v3 +; GFX900-NEXT: v_mul_f32_e32 v2, s10, v2 +; GFX900-NEXT: v_mul_f32_e32 v1, s9, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, s8, v0 +; GFX900-NEXT: v_mul_f32_e32 v8, s20, v8 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:32 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:48 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GFX900-NEXT: s_endpgm ; ; PACKED-SDAG-LABEL: fmul_v32_vs: @@ -1609,61 +1614,62 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40 +; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v60, 7, v0 :: v_dual_mov_b32 v32, s40 ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1] -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v60, s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v60, s[0:1] +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v60, s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v60, s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v60, s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v60, s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v60, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v60, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v56, s16 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v57, s17 :: v_dual_mov_b32 v58, s18 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v59, s19 :: v_dual_mov_b32 v51, s11 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v41, s51 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v43, s45 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v44, s46 :: v_dual_mov_b32 v37, s39 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v39, s49 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v40, s50 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s16 :: v_dual_mov_b32 v33, s17 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[40:41] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[38:39] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[8:9] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[46:47] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[32:33] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[34:35] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[50:51] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1 +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[52:53] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[52:53] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[54:55] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[50:51] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[40:41] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[46:47] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[56:57] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[58:59] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[54:55] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[34:35] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[48:49] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[42:43] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[44:45] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[36:37] -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[42:43] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[44:45] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[36:37] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[40:41] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[32:33] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1] -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[28:31], s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[24:27], s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[20:23], s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[16:19], s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[12:15], s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[8:11], s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[4:7], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v60, v[0:3], s[0:1] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fmul_v32_vs: @@ -1674,17 +1680,17 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 ; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v58, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1] -; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16 -; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 -; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v58, s[0:1] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v58, s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v58, s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v58, s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v58, s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v58, s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v58, s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v58, s[0:1] offset:112 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -1697,16 +1703,16 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[56:57], s[16:17] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[20:21] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[36:37] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[22:23] ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[38:39] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[40:41] ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[42:43] @@ -1720,20 +1726,20 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[52:53] ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[54:55] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[32:33] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[34:35] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[56:57] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[32:33] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[36:37] -; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[34:35] +; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[36:37] ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1] -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[0:3], s[0:1] +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[4:7], s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[8:11], s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[12:15], s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[16:19], s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[20:23], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[24:27], s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[28:31], s[0:1] offset:112 ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id @@ -2275,64 +2281,66 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900-LABEL: fma_v32_vs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 7, v0 ; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 -; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] -; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 -; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 -; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 -; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43 -; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42 -; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41 -; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40 -; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19 -; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18 -; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17 -; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16 -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 -; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 -; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 -; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:96 +; GFX900-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64 +; GFX900-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:80 ; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51 -; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50 -; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49 -; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48 -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47 -; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46 -; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45 -; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44 +; GFX900-NEXT: v_fma_f32 v3, v3, s23, s23 +; GFX900-NEXT: v_fma_f32 v2, v2, s22, s22 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15 -; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14 -; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13 -; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12 +; GFX900-NEXT: v_fma_f32 v11, v11, s11, s11 +; GFX900-NEXT: v_fma_f32 v10, v10, s10, s10 +; GFX900-NEXT: v_fma_f32 v9, v9, s9, s9 +; GFX900-NEXT: v_fma_f32 v8, v8, s8, s8 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11 -; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10 -; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9 -; GFX900-NEXT: v_fma_f32 v21, v21, s8, s8 -; GFX900-NEXT: v_fma_f32 v28, v28, s23, s23 -; GFX900-NEXT: v_fma_f32 v27, v27, s22, s22 -; GFX900-NEXT: v_fma_f32 v26, v26, s21, s21 -; GFX900-NEXT: v_fma_f32 v25, v25, s20, s20 -; GFX900-NEXT: global_store_dwordx4 v0, v[29:32], s[0:1] offset:96 -; GFX900-NEXT: global_store_dwordx4 v0, v[25:28], s[0:1] offset:112 -; GFX900-NEXT: global_store_dwordx4 v0, v[21:24], s[0:1] offset:64 -; GFX900-NEXT: global_store_dwordx4 v0, v[17:20], s[0:1] offset:80 -; GFX900-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:32 -; GFX900-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:48 -; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] -; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] offset:16 +; GFX900-NEXT: v_fma_f32 v15, v15, s15, s15 +; GFX900-NEXT: v_fma_f32 v14, v14, s14, s14 +; GFX900-NEXT: v_fma_f32 v13, v13, s13, s13 +; GFX900-NEXT: v_fma_f32 v12, v12, s12, s12 +; GFX900-NEXT: v_fma_f32 v1, v1, s21, s21 +; GFX900-NEXT: v_fma_f32 v0, v0, s20, s20 +; GFX900-NEXT: v_fma_f32 v7, v7, s19, s19 +; GFX900-NEXT: v_fma_f32 v6, v6, s18, s18 +; GFX900-NEXT: v_fma_f32 v5, v5, s17, s17 +; GFX900-NEXT: v_fma_f32 v4, v4, s16, s16 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:64 +; GFX900-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:48 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:96 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:112 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:80 +; GFX900-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GFX900-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xa4 +; GFX900-NEXT: s_waitcnt vmcnt(6) lgkmcnt(0) +; GFX900-NEXT: v_fma_f32 v11, v11, s23, s23 +; GFX900-NEXT: v_fma_f32 v10, v10, s22, s22 +; GFX900-NEXT: v_fma_f32 v9, v9, s21, s21 +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_fma_f32 v15, v15, s19, s19 +; GFX900-NEXT: v_fma_f32 v14, v14, s18, s18 +; GFX900-NEXT: v_fma_f32 v13, v13, s17, s17 +; GFX900-NEXT: v_fma_f32 v12, v12, s16, s16 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_fma_f32 v7, v7, s15, s15 +; GFX900-NEXT: v_fma_f32 v6, v6, s14, s14 +; GFX900-NEXT: v_fma_f32 v5, v5, s13, s13 +; GFX900-NEXT: v_fma_f32 v4, v4, s12, s12 +; GFX900-NEXT: v_fma_f32 v3, v3, s11, s11 +; GFX900-NEXT: v_fma_f32 v2, v2, s10, s10 +; GFX900-NEXT: v_fma_f32 v1, v1, s9, s9 +; GFX900-NEXT: v_fma_f32 v0, v0, s8, s8 +; GFX900-NEXT: v_fma_f32 v8, v8, s20, s20 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:32 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:48 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GFX900-NEXT: s_endpgm ; ; PACKED-SDAG-LABEL: fma_v32_vs: @@ -2440,21 +2448,21 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 ; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v58, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1] -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v58, s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v58, s[0:1] +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v58, s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v58, s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v58, s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v58, s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v58, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v58, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[38:39] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[48:49] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[56:57], s[16:17] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[20:21] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[22:23] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[10:11] @@ -2462,39 +2470,40 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[14:15] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[44:45] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[46:47] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[38:39] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[48:49] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[18:19] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[18:19] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[40:41], v[40:41] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[38:39], v[38:39] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[36:37], v[36:37] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[8:9] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[36:37] +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1 +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[52:53], v[52:53] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[52:53], v[52:53] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[32:33], v[32:33] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[34:35], v[34:35] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[54:55], v[54:55] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[50:51], v[50:51] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[40:41], v[40:41] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[56:57], v[56:57] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[32:33], v[32:33] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[54:55], v[54:55] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[50:51], v[50:51] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[36:37], v[36:37] ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[46:47], v[46:47] ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[48:49], v[48:49] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[42:43], v[42:43] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[44:45], v[44:45] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[36:37], v[36:37] -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[38:39], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[42:43], v[42:43] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[44:45], v[44:45] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[38:39], v[38:39] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[40:41], v[40:41] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[34:35], v[34:35] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1] -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v58, v[28:31], s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v58, v[24:27], s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v58, v[20:23], s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v58, v[16:19], s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v58, v[12:15], s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v58, v[8:11], s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v58, v[4:7], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v58, v[0:3], s[0:1] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fma_v32_vs: @@ -2505,17 +2514,17 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 ; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v58, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1] -; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16 -; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 -; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v58, s[0:1] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v58, s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v58, s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v58, s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v58, s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v58, s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v58, s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v58, s[0:1] offset:112 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -2528,16 +2537,16 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[56:57], s[16:17] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[20:21] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[36:37], v[36:37] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[22:23] ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[38:39], v[38:39] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[40:41], v[40:41] ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[42:43], v[42:43] @@ -2551,20 +2560,20 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[52:53], v[52:53] ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[54:55], v[54:55] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[32:33], v[32:33] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[34:35], v[34:35] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[56:57], v[56:57] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[32:33], v[32:33] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37] -; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[34:35], v[34:35] +; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[36:37], v[36:37] ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1] -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[0:3], s[0:1] +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[4:7], s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[8:11], s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[12:15], s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[16:19], s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[20:23], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[24:27], s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v58, v[28:31], s[0:1] offset:112 ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index 69983faf2b154..2a3d3e938f091 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=0 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s # Check that %3 was not rematerialized before the last store since its operand %1 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 58375b6f8a8a4..c2e5e1cb2107d 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -139,12 +139,12 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[34:35] +; GFX906-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX906-NEXT: s_mov_b32 s12, s24 ; GFX906-NEXT: s_mov_b32 s13, s23 ; GFX906-NEXT: s_mov_b32 s14, s22 -; GFX906-NEXT: v_mov_b32_e32 v31, v32 ; GFX906-NEXT: s_mov_b32 s15, s21 -; GFX906-NEXT: s_mov_b64 s[10:11], s[26:27] +; GFX906-NEXT: v_mov_b32_e32 v31, v32 ; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: v_mov_b32_e32 v40, v32 @@ -232,11 +232,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_readlane_b32 s12, v39, 3 -; GFX906-NEXT: v_mov_b32_e32 v31, v40 -; GFX906-NEXT: v_readlane_b32 s13, v39, 2 -; GFX906-NEXT: v_readlane_b32 s14, v39, 1 -; GFX906-NEXT: v_readlane_b32 s15, v39, 0 ; GFX906-NEXT: v_readlane_b32 s4, v39, 10 ; GFX906-NEXT: v_readlane_b32 s5, v39, 11 ; GFX906-NEXT: v_readlane_b32 s6, v39, 8 @@ -245,12 +240,18 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: v_readlane_b32 s9, v39, 7 ; GFX906-NEXT: v_readlane_b32 s10, v39, 4 ; GFX906-NEXT: v_readlane_b32 s11, v39, 5 +; GFX906-NEXT: v_readlane_b32 s12, v39, 3 +; GFX906-NEXT: v_readlane_b32 s13, v39, 2 +; GFX906-NEXT: v_readlane_b32 s14, v39, 1 +; GFX906-NEXT: v_readlane_b32 s15, v39, 0 +; GFX906-NEXT: v_mov_b32_e32 v31, v40 ; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX906-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX906-NEXT: s_mov_b64 exec, s[34:35] +; GFX906-NEXT: v_mov_b32_e32 v31, v40 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_readlane_b32 s4, v39, 10 ; GFX906-NEXT: v_readlane_b32 s5, v39, 11 @@ -264,7 +265,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: v_readlane_b32 s13, v39, 2 ; GFX906-NEXT: v_readlane_b32 s14, v39, 1 ; GFX906-NEXT: v_readlane_b32 s15, v39, 0 -; GFX906-NEXT: v_mov_b32_e32 v31, v40 ; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: v_readlane_b32 s21, v39, 12 @@ -528,12 +528,12 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX908-NEXT: s_mov_b32 s12, s24 ; GFX908-NEXT: s_mov_b32 s13, s23 ; GFX908-NEXT: s_mov_b32 s14, s22 -; GFX908-NEXT: v_mov_b32_e32 v31, v32 ; GFX908-NEXT: s_mov_b32 s15, s21 -; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27] +; GFX908-NEXT: v_mov_b32_e32 v31, v32 ; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: v_mov_b32_e32 v40, v32 @@ -621,11 +621,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_readlane_b32 s12, v39, 3 -; GFX908-NEXT: v_mov_b32_e32 v31, v40 -; GFX908-NEXT: v_readlane_b32 s13, v39, 2 -; GFX908-NEXT: v_readlane_b32 s14, v39, 1 -; GFX908-NEXT: v_readlane_b32 s15, v39, 0 ; GFX908-NEXT: v_readlane_b32 s4, v39, 10 ; GFX908-NEXT: v_readlane_b32 s5, v39, 11 ; GFX908-NEXT: v_readlane_b32 s6, v39, 8 @@ -634,12 +629,18 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: v_readlane_b32 s9, v39, 7 ; GFX908-NEXT: v_readlane_b32 s10, v39, 4 ; GFX908-NEXT: v_readlane_b32 s11, v39, 5 +; GFX908-NEXT: v_readlane_b32 s12, v39, 3 +; GFX908-NEXT: v_readlane_b32 s13, v39, 2 +; GFX908-NEXT: v_readlane_b32 s14, v39, 1 +; GFX908-NEXT: v_readlane_b32 s15, v39, 0 +; GFX908-NEXT: v_mov_b32_e32 v31, v40 ; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: v_mov_b32_e32 v31, v40 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readlane_b32 s4, v39, 10 ; GFX908-NEXT: v_readlane_b32 s5, v39, 11 @@ -653,7 +654,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: v_readlane_b32 s13, v39, 2 ; GFX908-NEXT: v_readlane_b32 s14, v39, 1 ; GFX908-NEXT: v_readlane_b32 s15, v39, 0 -; GFX908-NEXT: v_mov_b32_e32 v31, v40 ; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: v_readlane_b32 s21, v39, 12 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 098a60dd61a1c..14951c63a4897 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -365,110 +365,110 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 17, v0 -; GFX8-NEXT: v_and_b32_e32 v10, 0xfe000000, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: s_movk_i32 s0, 0x2800 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, 0 +; GFX8-NEXT: s_movk_i32 s2, 0x2800 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, 0x7f -; GFX8-NEXT: s_movk_i32 s1, 0x800 -; GFX8-NEXT: s_movk_i32 s2, 0x1000 -; GFX8-NEXT: s_movk_i32 s3, 0x1800 -; GFX8-NEXT: s_movk_i32 s4, 0x2000 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v7, 0x7f +; GFX8-NEXT: s_movk_i32 s3, 0x800 +; GFX8-NEXT: s_movk_i32 s4, 0x1000 +; GFX8-NEXT: s_movk_i32 s5, 0x1800 +; GFX8-NEXT: s_movk_i32 s6, 0x2000 ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_mov_b32 s5, 0 +; GFX8-NEXT: s_mov_b32 s7, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffd800, v2 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[2:3] -; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[8:9] ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffe000, v2 +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xffffe800, v2 ; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffe800, v2 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[8:9] +; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[2:3] +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[14:15] ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xfffff000, v2 ; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xfffff800, v2 -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[16:17] -; GFX8-NEXT: v_add_u32_e32 v16, vcc, s1, v2 -; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX8-NEXT: s_addk_i32 s5, 0x2000 -; GFX8-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v22, vcc, v14, v6 -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v15, v7, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, s2, v2 -; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[20:21] +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xfffff800, v2 ; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17] -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v20, vcc, s3, v2 -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v3, vcc -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v22, vcc, v12, v22 -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v13, v23, vcc -; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v2 -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] -; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; GFX8-NEXT: v_addc_u32_e32 v19, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[18:19] +; GFX8-NEXT: s_addk_i32 s7, 0x2000 +; GFX8-NEXT: s_cmp_gt_u32 s7, 0x3fffff +; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v8, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s3, v2 +; GFX8-NEXT: v_addc_u32_e64 v9, s[0:1], v9, v5, s[0:1] +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v22, vcc, v18, v22 -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v19, v23, vcc -; GFX8-NEXT: v_add_u32_e32 v18, vcc, s0, v2 +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], v12, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v2 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], v13, v9, s[0:1] +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], v14, v12 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s5, v2 +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], v15, v13, s[0:1] +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] ; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] -; GFX8-NEXT: v_addc_u32_e32 v19, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[18:19] +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], v16, v14 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, s6, v2 +; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], v17, v15, s[0:1] +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u32_e64 v18, s[0:1], v18, v16 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s2, v2 +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[14:15] +; GFX8-NEXT: v_addc_u32_e64 v19, s[0:1], v19, v17, s[0:1] +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17] ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x10000, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v22 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v23, vcc -; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v14, v8 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v15, v9, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v18 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v19, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v16, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v17, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v8, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v20, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v21, v5, vcc -; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v14, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v18, v4 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v19, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v16, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v17, v5, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 1, v11 +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 1, v7 ; GFX8-NEXT: s_and_b64 vcc, exec, vcc ; GFX8-NEXT: s_cbranch_vccz .LBB1_1 ; GFX8-NEXT: ; %bb.4: ; %while.end ; GFX8-NEXT: v_mov_b32_e32 v1, s35 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v10 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7] +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: clmem_read: @@ -517,57 +517,58 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffe000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[14:15], v[8:9], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[18:19], v[8:9], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[20:21], v[8:9], off +; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, s2, v2 +; GFX900-NEXT: v_add_co_u32_e64 v14, s[0:1], s3, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v3, vcc +; GFX900-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v3, s[0:1] +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[16:17], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[22:23], v[14:15], off offset:2048 ; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 ; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 ; GFX900-NEXT: s_addk_i32 s5, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v14, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX900-NEXT: global_load_dwordx2 v[14:15], v[8:9], off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v14, v16 -; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v17, vcc -; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, s2, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v8, v14 -; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v9, v15, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:2048 -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v10, v14 -; GFX900-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v15, vcc -; GFX900-NEXT: v_add_co_u32_e64 v14, s[0:1], v12, v14 -; GFX900-NEXT: v_addc_co_u32_e64 v15, s[0:1], v13, v11, s[0:1] -; GFX900-NEXT: v_add_co_u32_e32 v10, vcc, s3, v2 -; GFX900-NEXT: v_add_co_u32_e64 v12, s[0:1], s4, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc -; GFX900-NEXT: v_addc_co_u32_e64 v13, vcc, 0, v3, s[0:1] -; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v4, v14 -; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v5, v15, vcc -; GFX900-NEXT: global_load_dwordx2 v[4:5], v[12:13], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[14:15], v[10:11], off offset:2048 -; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v18, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v19, v5, vcc +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v20, v14 +; GFX900-NEXT: v_add_co_u32_e64 v14, s[0:1], s4, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v21, v15, vcc +; GFX900-NEXT: v_addc_co_u32_e64 v15, vcc, 0, v3, s[0:1] +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v8, v16 +; GFX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v9, v17, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[12:13], off -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[12:13], off offset:2048 +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v10, v16 +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:2048 +; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v11, v17, vcc +; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:2048 +; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: v_add_co_u32_e32 v18, vcc, v12, v16 +; GFX900-NEXT: v_addc_co_u32_e32 v19, vcc, v13, v17, vcc +; GFX900-NEXT: global_load_dwordx2 v[12:13], v[14:15], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[16:17], v[14:15], off ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v16 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v17, vcc +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v18 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v19, vcc ; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc ; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v22, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v23, v5, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index b761f689d6af5..c6733cff5b8f0 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -70,23 +70,23 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v9, vcc ; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] -; GFX9-NEXT: v_or_b32_e32 v12, v7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc ; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX9-NEXT: v_xor_b32_e32 v11, 0x7f, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX9-NEXT: v_or_b32_e32 v11, v11, v8 +; GFX9-NEXT: v_xor_b32_e32 v10, 0x7f, v6 +; GFX9-NEXT: v_or_b32_e32 v11, v7, v9 +; GFX9-NEXT: v_or_b32_e32 v10, v10, v8 ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12] +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v3, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v0, 0, s[4:5] ; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB0_6 @@ -107,118 +107,118 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_or_b32_e32 v8, v10, v12 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13 -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v13, v[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v13, v[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v9, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-NEXT: v_sub_u32_e32 v12, 64, v24 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[10:11], v24, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[12:13], v12, v[2:3] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v12, v8, v12 -; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v13, v9, v13 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v12, v10, v12 +; GFX9-NEXT: v_subrev_u32_e32 v10, 64, v24 +; GFX9-NEXT: v_or_b32_e32 v13, v11, v13 +; GFX9-NEXT: v_lshrrev_b64 v[10:11], v10, v[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v15, v9, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v12, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v17, v11, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v12, v10, v12, vcc +; GFX9-NEXT: v_lshrrev_b64 v[10:11], v24, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v16, v12, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v10, vcc ; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, -1, v23 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v22, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, -1, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v12, 0 -; GFX9-NEXT: v_mov_b32_e32 v18, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, -1, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: .LBB0_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 31, v15 -; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[14:15] -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 31, v7 -; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v11 +; GFX9-NEXT: v_lshlrev_b64 v[18:19], 1, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 31, v17 ; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] -; GFX9-NEXT: v_or_b32_e32 v14, v14, v33 -; GFX9-NEXT: v_or3_b32 v6, v6, v8, v12 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v28, v14 -; GFX9-NEXT: v_or_b32_e32 v16, v16, v32 -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v29, v15, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v30, v16, vcc -; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v31, v17, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; GFX9-NEXT: v_or_b32_e32 v10, v18, v10 -; GFX9-NEXT: v_and_b32_e32 v18, v8, v23 -; GFX9-NEXT: v_or_b32_e32 v11, v19, v11 -; GFX9-NEXT: v_and_b32_e32 v19, v8, v22 -; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v14, v18 -; GFX9-NEXT: v_and_b32_e32 v32, v8, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v15, v19, vcc -; GFX9-NEXT: v_and_b32_e32 v33, v8, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v32, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v33, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 31, v7 +; GFX9-NEXT: v_or_b32_e32 v16, v16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 31, v9 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v8, v14, v8 +; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v28, v16 +; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v29, v17, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v30, v10, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v31, v19, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v32, 31, v14 +; GFX9-NEXT: v_and_b32_e32 v14, v32, v23 +; GFX9-NEXT: v_or_b32_e32 v9, v15, v9 +; GFX9-NEXT: v_and_b32_e32 v15, v32, v22 +; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, v32, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v15, vcc +; GFX9-NEXT: v_or3_b32 v6, v6, v18, v12 +; GFX9-NEXT: v_and_b32_e32 v15, v32, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v10, v14, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v19, v15, vcc ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, -1, v24 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v26, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v27, vcc -; GFX9-NEXT: v_or_b32_e32 v18, v24, v26 -; GFX9-NEXT: v_or_b32_e32 v19, v25, v27 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] -; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_mov_b32_e32 v19, v9 +; GFX9-NEXT: v_or_b32_e32 v14, v24, v26 +; GFX9-NEXT: v_or_b32_e32 v15, v25, v27 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_and_b32_e32 v10, 1, v32 +; GFX9-NEXT: v_mov_b32_e32 v15, v11 ; GFX9-NEXT: v_or3_b32 v7, v7, 0, v13 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v18, v8 +; GFX9-NEXT: v_mov_b32_e32 v14, v10 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB0_3 ; GFX9-NEXT: ; %bb.4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: .LBB0_5: ; %Flow2 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[10:11] +; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 31, v11 -; GFX9-NEXT: v_or3_b32 v10, v7, 0, v13 -; GFX9-NEXT: v_or3_b32 v12, v6, v11, v12 -; GFX9-NEXT: v_or_b32_e32 v11, v9, v15 -; GFX9-NEXT: v_or_b32_e32 v13, v8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v9 +; GFX9-NEXT: v_or3_b32 v13, v7, 0, v13 +; GFX9-NEXT: v_or3_b32 v12, v6, v8, v12 +; GFX9-NEXT: v_or_b32_e32 v11, v11, v15 +; GFX9-NEXT: v_or_b32_e32 v10, v10, v14 ; GFX9-NEXT: .LBB0_6: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v16, v13, v5 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0 +; GFX9-NEXT: v_mul_lo_u32 v17, v10, v5 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v10, 0 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[6:7] -; GFX9-NEXT: v_mul_lo_u32 v15, v11, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, v12, v22 -; GFX9-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-NEXT: v_mul_lo_u32 v16, v11, v4 +; GFX9-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v22, v10, v[6:7] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, v14 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v23, v11, v[6:7] -; GFX9-NEXT: v_add3_u32 v9, v9, v16, v15 +; GFX9-NEXT: v_add3_u32 v9, v9, v17, v16 ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v23, v[8:9] -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v7 +; GFX9-NEXT: v_mul_lo_u32 v4, v12, v22 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v15, v7 +; GFX9-NEXT: v_mul_lo_u32 v14, v13, v23 ; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mul_lo_u32 v15, v10, v23 ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v11, v[12:13] -; GFX9-NEXT: v_add3_u32 v4, v15, v9, v4 +; GFX9-NEXT: v_add3_u32 v4, v14, v9, v4 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v10, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v4, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 @@ -1496,61 +1496,61 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_lshrrev_b64 v[12:13], v12, v[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, v13, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v19, v13, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v14, v12, v14, vcc ; GFX9-NEXT: v_lshrrev_b64 v[12:13], v22, v[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v16, v14, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v18, v14, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v20, 0, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, -1, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, -1, v6, vcc ; GFX9-NEXT: v_mov_b32_e32 v14, 0 -; GFX9-NEXT: v_mov_b32_e32 v20, 0 +; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v7, vcc ; GFX9-NEXT: v_mov_b32_e32 v15, 0 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: v_mov_b32_e32 v21, 0 +; GFX9-NEXT: v_mov_b32_e32 v17, 0 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: .LBB1_3: ; %udiv-do-while ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[10:11] -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 -; GFX9-NEXT: v_or_b32_e32 v10, v20, v30 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v17 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17] -; GFX9-NEXT: v_or_b32_e32 v11, v21, v31 +; GFX9-NEXT: v_lshlrev_b64 v[20:21], 1, v[20:21] +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v19 ; GFX9-NEXT: v_lshlrev_b64 v[18:19], 1, v[18:19] -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 31, v9 -; GFX9-NEXT: v_or_b32_e32 v16, v16, v21 +; GFX9-NEXT: v_or_b32_e32 v12, v20, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 31, v9 ; GFX9-NEXT: v_or_b32_e32 v18, v18, v20 -; GFX9-NEXT: v_sub_co_u32_e32 v20, vcc, v26, v16 -; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v27, v17, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v28, v18, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v29, v19, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v20 -; GFX9-NEXT: v_and_b32_e32 v20, v30, v4 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] -; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v16, v20 -; GFX9-NEXT: v_and_b32_e32 v20, v30, v5 -; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v20, vcc -; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 -; GFX9-NEXT: v_and_b32_e32 v12, v30, v6 -; GFX9-NEXT: v_and_b32_e32 v20, v30, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v18, v12, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v20, vcc, v26, v18 +; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v27, v19, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v28, v12, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v29, v21, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v32, 31, v20 +; GFX9-NEXT: v_and_b32_e32 v20, v32, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v18, vcc, v18, v20 +; GFX9-NEXT: v_and_b32_e32 v20, v32, v5 ; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v19, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v20, v32, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v20, vcc, v12, v20, vcc +; GFX9-NEXT: v_and_b32_e32 v12, v32, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v21, vcc, v21, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22 ; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc -; GFX9-NEXT: v_or_b32_e32 v20, v22, v24 -; GFX9-NEXT: v_or_b32_e32 v21, v23, v25 -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] -; GFX9-NEXT: v_and_b32_e32 v12, 1, v30 -; GFX9-NEXT: v_mov_b32_e32 v21, v13 +; GFX9-NEXT: v_or_b32_e32 v30, v22, v24 +; GFX9-NEXT: v_or_b32_e32 v31, v23, v25 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 31, v11 +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31] +; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14 +; GFX9-NEXT: v_or_b32_e32 v11, v17, v11 +; GFX9-NEXT: v_or_b32_e32 v10, v16, v10 +; GFX9-NEXT: v_and_b32_e32 v12, 1, v32 +; GFX9-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v20, v12 +; GFX9-NEXT: v_mov_b32_e32 v16, v12 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execnz .LBB1_3 ; GFX9-NEXT: ; %bb.4: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index e29be2b744874..c6d5a353de96e 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -398,44 +398,42 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK-NEXT: v_mov_b32_e32 v10, v8 ; CHECK-NEXT: v_mov_b32_e32 v11, v8 ; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v6 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19] ; CHECK-NEXT: v_mov_b64_e32 v[0:1], 0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11] ; CHECK-NEXT: global_store_short v[0:1], v2, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19] ; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[20:23] ; CHECK-NEXT: s_nop 5 ; CHECK-NEXT: v_cvt_f16_f32_e32 v10, v6 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[16:19] ; CHECK-NEXT: global_store_short v[0:1], v10, off -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[10:13], v[26:27], v[26:27], v[12:15] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[28:29], v[26:27], v[6:9] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[2:5] +; CHECK-NEXT: s_nop 4 +; CHECK-NEXT: v_cvt_f16_f32_e32 v10, v10 ; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CHECK-NEXT: global_store_short v[0:1], v6, off -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23] +; CHECK-NEXT: global_store_short v[0:1], v10, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 +; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CHECK-NEXT: global_store_short v[0:1], v24, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5] -; CHECK-NEXT: s_nop 6 -; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v2 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19] ; CHECK-NEXT: global_store_short v[0:1], v6, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CHECK-NEXT: global_store_short v[0:1], v2, off ; CHECK-NEXT: s_endpgm entry: @@ -588,8 +586,8 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a34, a33, v[0:31] -; CHECK-NEXT: v_mov_b32_e32 v1, 0x41000000 ; CHECK-NEXT: v_accvgpr_read_b32 v0, a32 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x41800000 ; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: v_mov_b64_e32 v[2:3], v[32:33] ; CHECK-NEXT: v_mov_b64_e32 v[4:5], v[34:35] @@ -616,10 +614,9 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add ; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32 ; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v26, 0x41000000 ; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_mov_b32_e32 v2, 0x41800000 -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] +; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v26, v1, a[0:31] ; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: s_nop 1 ; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[2:3] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll index e34fdd9ae6902..06a5509e70e0c 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -1671,7 +1671,7 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 ; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 -; SI-GISEL-NEXT: v_mov_b32_e32 v18, 0x3ff00000 +; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0x3ff00000 ; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 @@ -1713,22 +1713,23 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] ; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] ; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v18 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[14:15], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v20 +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[14:15] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v20 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; SI-GISEL-NEXT: s_nop 0 ; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1967,7 +1968,7 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 ; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 -; SI-GISEL-NEXT: v_mov_b32_e32 v18, 0xbff00000 +; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000 ; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 @@ -2009,22 +2010,23 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] ; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] ; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v18 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[10:11], v[14:15], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v20 +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[6:7], -1.0, v[2:3], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[14:15] -; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v18 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v20 ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 +; SI-GISEL-NEXT: s_nop 0 ; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2230,6 +2232,8 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 @@ -2237,61 +2241,59 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v12 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v10, 8, v12 ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v10 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 -; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 -; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] ; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[8:9], v[8:9], v[2:3] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[10:11] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 ; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 -; SI-GISEL-NEXT: v_rcp_f64_e32 v[14:15], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] +; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] ; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v10 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[14:15], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], s[4:5], v[2:3], s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[14:15], v[12:13], v[14:15] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[16:17], v[10:11] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[18:19], v[4:5], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], v[16:17] -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v17 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v19 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 ; SI-GISEL-NEXT: s_nop 0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[10:11], v[12:13] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], s[4:5] ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2500,6 +2502,8 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80 +; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260 ; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 ; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[0:1], v[6:7] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 @@ -2507,62 +2511,60 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1] ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7] -; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v12 -; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v10, 8, v12 ; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3] +; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v10 ; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc -; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5 -; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5 -; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13 -; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15 +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], 0.5 +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] ; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[8:9], v[8:9], v[2:3] -; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13 -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] -; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[10:11] -; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5] -; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v14, s[4:5] +; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], v[6:7] -; SI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[4:5], 1.0 -; SI-GISEL-NEXT: v_rcp_f64_e32 v[14:15], v[6:7] -; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[12:13], v[4:5] -; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], 1.0 -; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], v[12:13] -; SI-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13] +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[8:9] +; SI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0 ; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7] ; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v13, v10 -; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[6:7], v[14:15], 1.0 -; SI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] ; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v11 -; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[14:15], v[12:13], v[14:15] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[6:7] ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; SI-GISEL-NEXT: v_mul_f64 v[12:13], v[16:17], v[10:11] -; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[18:19], v[4:5], v[8:9] -; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], v[16:17] -; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x3ff00000 -; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v17, v6 -; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0x3ff00000 +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v19, v8 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 ; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 ; SI-GISEL-NEXT: s_nop 0 -; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[10:11], v[12:13] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] ; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index b754a6b897159..60c0cb83c1ed9 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -24,25 +24,25 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_512 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY [[V_MOV_B32_e32_]] + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]].sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32), align 8, addrspace 5) + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_512 = COPY [[COPY]].sub0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_512 = COPY [[COPY]].sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:vreg_512 = COPY [[COPY]].sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_1]] + ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[V_MOV_B32_e32_1]], implicit $exec ; CHECK-NEXT: dead [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec ; CHECK-NEXT: dead [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[V_MOV_B32_e32_1]], implicit $exec ; CHECK-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_512 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead [[COPY1]], 1245194 /* regdef:VGPR_32 */, def dead [[COPY]].sub1, 1245193 /* reguse:VGPR_32 */, [[COPY1]], 1245193 /* reguse:VGPR_32 */, [[COPY]].sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_512 = COPY [[COPY]].sub0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_512 = COPY [[COPY]].sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_1]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_1]] - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY [[COPY2]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead [[COPY2]], 1245194 /* regdef:VGPR_32 */, def dead undef [[COPY]].sub1, 1245193 /* reguse:VGPR_32 */, [[COPY2]], 1245193 /* reguse:VGPR_32 */, [[COPY]].sub1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY [[COPY1]] ; CHECK-NEXT: dead [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_2]], implicit-def dead $vcc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir index 5edb9669d98eb..b2412f6ac368b 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -43,7 +43,7 @@ body: | ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead %11 ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1) ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3) ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %15, 1245194 /* regdef:VGPR_32 */, def %16 ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec @@ -51,8 +51,8 @@ body: | ; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def %21, 1245194 /* regdef:VGPR_32 */, def %22 ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1245194 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_3]], 1245194 /* regdef:VGPR_32 */, def dead [[V_MOV_B32_e32_4]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 1245193 /* reguse:VGPR_32 */, %15, 1245193 /* reguse:VGPR_32 */, %16, 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_1]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_3]], 1245193 /* reguse:VGPR_32 */, [[DS_READ_B32_gfx9_2]] - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %30:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll index f70cd6816a966..7009ef5c74cfa 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll @@ -1,5 +1,5 @@ ; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack -amdgpu-use-amdgpu-trackers=1 2>&1 < %s | FileCheck -check-prefixes=ERR-GCNTRACKERS %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack 2>&1 < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack -amdgpu-use-amdgpu-trackers=0 2>&1 < %s | FileCheck -check-prefixes=GCN %s %asm.output = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs <16 x i32>, <7 x i32>, ; vgprs diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll index 0d25bc97ff775..b6863d557deb4 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -amdgpu-use-amdgpu-trackers=0 < %s | FileCheck --check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -amdgpu-use-amdgpu-trackers=1 < %s | FileCheck --check-prefix=GCN-GCNTRACKERS %s ; CHECK-LABEL: {{^}}spill: diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index a29dc34c56d3a..9c09f0ea85cc0 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -1,10 +1,10 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -amdgpu-use-amdgpu-trackers=0 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL-GCNTRACKERS %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -verify-misched < %s | FileCheck --check-prefixes=TONGA %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -amdgpu-use-amdgpu-trackers=0 -verify-misched < %s | FileCheck --check-prefixes=TONGA %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=TONGA-GCNTRACKERS %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched < %s | FileCheck --check-prefixes=GFX908 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-use-amdgpu-trackers=0 -verify-misched < %s | FileCheck --check-prefixes=GFX908 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX908-GCNTRACKERS %s -; RUN: llc -mtriple=amdgcn -verify-misched < %s | FileCheck --check-prefixes=GENERIC %s +; RUN: llc -mtriple=amdgcn -amdgpu-use-amdgpu-trackers=0 -verify-misched < %s | FileCheck --check-prefixes=GENERIC %s ; RUN: llc -mtriple=amdgcn -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GENERIC-GCNTRACKERS %s ; GCN Trackers are sensitive to minor changes in RP, and will avoid scheduling certain instructions, which, if scheduled, diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir index 88e11c9ce3d1d..69381edc0514b 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir @@ -17,20 +17,20 @@ body: | ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub3:vreg_128 = COPY $vgpr9 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub2:vreg_128 = COPY $vgpr8 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_128 = COPY $vgpr7 - ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 - ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_128 = COPY $vgpr6 - ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub3:vreg_128 = COPY $vgpr5 - ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub2:vreg_128 = COPY $vgpr4 - ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub1:vreg_128 = COPY $vgpr3 - ; CHECK-NEXT: undef [[COPY8:%[0-9]+]].sub0:vreg_128 = COPY $vgpr2 - ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY6]].sub2, implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY5]].sub3, implicit $exec + ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_128 = COPY $vgpr6 + ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub3:vreg_128 = COPY $vgpr5 + ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub2:vreg_128 = COPY $vgpr4 + ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub1:vreg_128 = COPY $vgpr3 + ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub0:vreg_128 = COPY $vgpr2 + ; CHECK-NEXT: undef [[COPY8:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY8:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY5]].sub2, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY4]].sub3, implicit $exec ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY4]].sub0, implicit $exec + ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY3]].sub0, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY2]].sub1, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY1]].sub2, implicit $exec ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY]].sub3, implicit $exec @@ -38,7 +38,7 @@ body: | ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET]], [[BUFFER_LOAD_DWORD_OFFSET]], implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET1]], [[BUFFER_LOAD_DWORD_OFFSET1]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MUL_LO_U32_e64_]], [[V_MUL_LO_U32_e64_1]], implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY8]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 undef %43.sub3:vreg_128 = COPY $vgpr9 undef %42.sub2:vreg_128 = COPY $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir index aa0d1fe45e9a8..1b3e6756acc9d 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -amdgpu-use-amdgpu-trackers=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s --- | diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll index 7b8eba1091b48..4f8f229d0b43d 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=OCC %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=0 < %s | FileCheck --check-prefix=OCC %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 < %s | FileCheck --check-prefix=OCC-GCNTRACKER %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-schedule-relaxed-occupancy=true -amdgpu-use-amdgpu-trackers=0 < %s | FileCheck --check-prefix=RELAX %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX-GCNTRACKER %s diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index 840916aa63949..3674872092ec2 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -596,7 +596,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 @@ -616,66 +616,68 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog @@ -777,7 +779,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) @@ -801,66 +803,68 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog @@ -1619,7 +1623,7 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 @@ -1639,66 +1643,68 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog @@ -1800,7 +1806,7 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) @@ -1824,66 +1830,68 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog @@ -2642,7 +2650,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 @@ -2662,66 +2670,68 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog @@ -2823,7 +2833,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x10 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) @@ -2847,66 +2857,68 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog @@ -3662,7 +3674,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 @@ -3682,66 +3694,68 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog @@ -3843,7 +3857,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) @@ -3867,66 +3881,68 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog @@ -4682,7 +4698,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 @@ -4702,66 +4718,68 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog @@ -4863,7 +4881,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) @@ -4887,66 +4905,68 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog @@ -5711,7 +5731,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 @@ -5731,66 +5751,68 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-NEXT: s_mov_b32 s2, s7 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 @@ -5894,7 +5916,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) @@ -5918,66 +5940,68 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 @@ -6743,7 +6767,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 @@ -6763,66 +6787,68 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-NEXT: s_mov_b32 s2, s7 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 @@ -6926,7 +6952,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v30, 0x1fc, v0 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) @@ -6950,66 +6976,68 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v21 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v30 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v25, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v17 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[25:28], s0 offset:816 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v8, 0x200, v30 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v5 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v25 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[26:29], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:720 ; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v8, off ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5 ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 441509ba01f64..e760183a16fd2 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -792,255 +792,255 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: sdiv_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_mov_b32 s6, s10 +; GCN-NEXT: s_mov_b32 s7, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GCN-NEXT: s_mov_b32 s8, s0 +; GCN-NEXT: s_mov_b32 s9, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 ; GCN-NEXT: v_xor_b32_e32 v8, v0, v4 ; GCN-NEXT: v_max_i32_e32 v4, v4, v10 -; GCN-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 ; GCN-NEXT: v_xor_b32_e32 v11, v1, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GCN-NEXT: v_max_i32_e32 v5, v5, v13 -; GCN-NEXT: v_cvt_f32_u32_e32 v13, v5 -; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v13, v13 -; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v10 -; GCN-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 -; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GCN-NEXT: v_max_i32_e32 v0, v0, v9 -; GCN-NEXT: v_mul_hi_u32 v16, v10, v16 ; GCN-NEXT: v_max_i32_e32 v1, v1, v12 -; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v6 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v16 -; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v13 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v10 +; GCN-NEXT: v_cvt_f32_u32_e32 v12, v4 +; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 +; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 ; GCN-NEXT: v_xor_b32_e32 v14, v2, v6 -; GCN-NEXT: v_max_i32_e32 v6, v6, v15 -; GCN-NEXT: v_mul_hi_u32 v12, v13, v16 -; GCN-NEXT: v_cvt_f32_u32_e32 v15, v6 -; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GCN-NEXT: v_mul_lo_u32 v13, v10, v4 -; GCN-NEXT: v_mul_hi_u32 v12, v1, v12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15 -; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v13, vcc, v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1] +; GCN-NEXT: v_max_i32_e32 v5, v5, v13 +; GCN-NEXT: v_max_i32_e32 v6, v6, v16 +; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v14 +; GCN-NEXT: v_cvt_f32_u32_e32 v14, v5 +; GCN-NEXT: v_cvt_f32_u32_e32 v16, v6 +; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 +; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 +; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 +; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16 +; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 +; GCN-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 +; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_u32_f32_e32 v16, v16 +; GCN-NEXT: v_mul_lo_u32 v13, v13, v12 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 +; GCN-NEXT: v_sub_i32_e32 v17, vcc, 0, v7 +; GCN-NEXT: v_max_i32_e32 v0, v0, v9 +; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v11 +; GCN-NEXT: v_max_i32_e32 v2, v2, v15 +; GCN-NEXT: v_max_i32_e32 v11, v7, v17 +; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v5 +; GCN-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 +; GCN-NEXT: v_mul_lo_u32 v15, v15, v14 +; GCN-NEXT: v_mul_lo_u32 v17, v17, v16 +; GCN-NEXT: v_mul_hi_u32 v13, v12, v13 +; GCN-NEXT: v_cvt_f32_u32_e32 v18, v11 +; GCN-NEXT: v_mul_hi_u32 v15, v14, v15 +; GCN-NEXT: v_mul_hi_u32 v17, v16, v17 +; GCN-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, v14, v15 +; GCN-NEXT: v_add_i32_e32 v14, vcc, v16, v17 +; GCN-NEXT: v_mul_hi_u32 v13, v1, v13 +; GCN-NEXT: v_mul_hi_u32 v14, v2, v14 +; GCN-NEXT: v_mul_lo_u32 v15, v12, v4 +; GCN-NEXT: v_rcp_iflag_f32_e32 v18, v18 +; GCN-NEXT: v_mul_lo_u32 v17, v13, v5 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 1, v12 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 +; GCN-NEXT: v_mul_lo_u32 v15, v14, v6 +; GCN-NEXT: v_mul_f32_e32 v18, 0x4f7ffffe, v18 +; GCN-NEXT: v_cvt_u32_f32_e32 v18, v18 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v17 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v15 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 1, v13 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v14 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_mul_lo_u32 v0, v12, v5 -; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 -; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v12 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] -; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v4, v4, v9 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v7 -; GCN-NEXT: v_max_i32_e32 v5, v7, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5 -; GCN-NEXT: v_mul_hi_u32 v4, v9, v4 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; GCN-NEXT: v_max_i32_e32 v2, v2, v9 -; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 -; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v9, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1] +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GCN-NEXT: v_sub_i32_e32 v19, vcc, 0, v11 +; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v16, vcc, v0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[2:3] +; GCN-NEXT: v_sub_i32_e32 v17, vcc, v1, v5 +; GCN-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] +; GCN-NEXT: v_sub_i32_e32 v15, vcc, v2, v6 +; GCN-NEXT: v_mul_lo_u32 v19, v19, v18 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[0:1] +; GCN-NEXT: v_add_i32_e32 v16, vcc, 1, v12 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[2:3] +; GCN-NEXT: v_add_i32_e32 v17, vcc, 1, v13 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v15, s[4:5] +; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v14 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v12, v16, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v17, vcc ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v9 +; GCN-NEXT: v_mul_hi_u32 v4, v18, v19 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v1 -; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v10, v10, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc -; GCN-NEXT: v_mul_hi_u32 v4, v9, v10 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 -; GCN-NEXT: v_max_i32_e32 v6, v3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v6, v4 -; GCN-NEXT: v_xor_b32_e32 v2, v2, v14 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 -; GCN-NEXT: v_mul_lo_u32 v8, v4, v5 +; GCN-NEXT: v_cndmask_b32_e32 v2, v14, v15, vcc +; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 +; GCN-NEXT: v_max_i32_e32 v5, v3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v18, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v5, v4 +; GCN-NEXT: v_xor_b32_e32 v2, v2, v10 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GCN-NEXT: v_mul_lo_u32 v6, v4, v11 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, v6, v5 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GCN-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 +; GCN-NEXT: v_sub_i32_e32 v7, vcc, v5, v11 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GCN-NEXT: v_xor_b32_e32 v4, v4, v3 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_v4i32: ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_mov_b32 s11, 0xf000 +; TONGA-NEXT: s_mov_b32 s10, -1 +; TONGA-NEXT: s_mov_b32 s6, s10 +; TONGA-NEXT: s_mov_b32 s7, s11 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 -; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s4, s2 +; TONGA-NEXT: s_mov_b32 s5, s3 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; TONGA-NEXT: s_mov_b32 s8, s0 +; TONGA-NEXT: s_mov_b32 s9, s1 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 +; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 ; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4 ; TONGA-NEXT: v_max_i32_e32 v4, v4, v10 -; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v4 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5 ; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; TONGA-NEXT: v_max_i32_e32 v5, v5, v13 -; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v5 -; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4 -; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v13, v13 -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1 -; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10 -; TONGA-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 -; TONGA-NEXT: v_cvt_u32_f32_e32 v13, v13 -; TONGA-NEXT: v_max_i32_e32 v0, v0, v9 -; TONGA-NEXT: v_mul_hi_u32 v16, v10, v16 ; TONGA-NEXT: v_max_i32_e32 v1, v1, v12 -; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v6 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v16 -; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v5 -; TONGA-NEXT: v_mul_lo_u32 v16, v16, v13 -; TONGA-NEXT: v_mul_hi_u32 v10, v0, v10 +; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v4 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5 +; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v6 ; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6 -; TONGA-NEXT: v_max_i32_e32 v6, v6, v15 -; TONGA-NEXT: v_mul_hi_u32 v12, v13, v16 -; TONGA-NEXT: v_cvt_f32_u32_e32 v15, v6 -; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v11 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v13, v12 -; TONGA-NEXT: v_mul_lo_u32 v13, v10, v4 -; TONGA-NEXT: v_mul_hi_u32 v12, v1, v12 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15 -; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v13 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1] +; TONGA-NEXT: v_max_i32_e32 v5, v5, v13 +; TONGA-NEXT: v_max_i32_e32 v6, v6, v16 +; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v14 +; TONGA-NEXT: v_cvt_f32_u32_e32 v14, v5 +; TONGA-NEXT: v_cvt_f32_u32_e32 v16, v6 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v4 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v14, v14 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v16, v16 +; TONGA-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12 +; TONGA-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 +; TONGA-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 +; TONGA-NEXT: v_cvt_u32_f32_e32 v14, v14 +; TONGA-NEXT: v_cvt_u32_f32_e32 v16, v16 +; TONGA-NEXT: v_mul_lo_u32 v13, v13, v12 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 +; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v2 +; TONGA-NEXT: v_sub_u32_e32 v17, vcc, 0, v7 +; TONGA-NEXT: v_max_i32_e32 v0, v0, v9 +; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v11 +; TONGA-NEXT: v_max_i32_e32 v2, v2, v15 +; TONGA-NEXT: v_max_i32_e32 v11, v7, v17 +; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v5 +; TONGA-NEXT: v_sub_u32_e32 v17, vcc, 0, v6 +; TONGA-NEXT: v_mul_lo_u32 v15, v15, v14 +; TONGA-NEXT: v_mul_lo_u32 v17, v17, v16 +; TONGA-NEXT: v_mul_hi_u32 v13, v12, v13 +; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v11 +; TONGA-NEXT: v_mul_hi_u32 v15, v14, v15 +; TONGA-NEXT: v_mul_hi_u32 v17, v16, v17 +; TONGA-NEXT: v_add_u32_e32 v12, vcc, v12, v13 +; TONGA-NEXT: v_mul_hi_u32 v12, v0, v12 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v14, v15 +; TONGA-NEXT: v_add_u32_e32 v14, vcc, v16, v17 +; TONGA-NEXT: v_mul_hi_u32 v13, v1, v13 +; TONGA-NEXT: v_mul_hi_u32 v14, v2, v14 +; TONGA-NEXT: v_mul_lo_u32 v15, v12, v4 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v18, v18 +; TONGA-NEXT: v_mul_lo_u32 v17, v13, v5 +; TONGA-NEXT: v_add_u32_e32 v16, vcc, 1, v12 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v15 +; TONGA-NEXT: v_mul_lo_u32 v15, v14, v6 +; TONGA-NEXT: v_mul_f32_e32 v18, 0x4f7ffffe, v18 +; TONGA-NEXT: v_cvt_u32_f32_e32 v18, v18 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v17 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v15 +; TONGA-NEXT: v_add_u32_e32 v17, vcc, 1, v13 +; TONGA-NEXT: v_add_u32_e32 v15, vcc, 1, v14 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_mul_lo_u32 v0, v12, v5 -; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 -; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v6 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v12 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v5 -; TONGA-NEXT: v_mul_lo_u32 v4, v4, v9 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v7 -; TONGA-NEXT: v_max_i32_e32 v5, v7, v0 -; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5 -; TONGA-NEXT: v_mul_hi_u32 v4, v9, v4 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 -; TONGA-NEXT: v_max_i32_e32 v2, v2, v9 -; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4 -; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v11 +; TONGA-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v16, vcc, v0, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[2:3] +; TONGA-NEXT: v_sub_u32_e32 v17, vcc, v1, v5 +; TONGA-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] +; TONGA-NEXT: v_sub_u32_e32 v15, vcc, v2, v6 +; TONGA-NEXT: v_mul_lo_u32 v19, v19, v18 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[0:1] +; TONGA-NEXT: v_add_u32_e32 v16, vcc, 1, v12 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[2:3] +; TONGA-NEXT: v_add_u32_e32 v17, vcc, 1, v13 +; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v15, s[4:5] +; TONGA-NEXT: v_add_u32_e32 v15, vcc, 1, v14 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v12, v16, vcc +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v13, v17, vcc ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9 +; TONGA-NEXT: v_mul_hi_u32 v4, v18, v19 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 -; TONGA-NEXT: v_mul_lo_u32 v8, v4, v6 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v1 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v5 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] -; TONGA-NEXT: v_mul_lo_u32 v10, v10, v9 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11 -; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v2, v6 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v11 -; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc -; TONGA-NEXT: v_mul_hi_u32 v4, v9, v10 -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v3 -; TONGA-NEXT: v_max_i32_e32 v6, v3, v6 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 -; TONGA-NEXT: v_mul_hi_u32 v4, v6, v4 -; TONGA-NEXT: v_xor_b32_e32 v2, v2, v14 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v14 -; TONGA-NEXT: v_mul_lo_u32 v8, v4, v5 +; TONGA-NEXT: v_cndmask_b32_e32 v2, v14, v15, vcc +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v3 +; TONGA-NEXT: v_max_i32_e32 v5, v3, v5 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v18, v4 +; TONGA-NEXT: v_mul_hi_u32 v4, v5, v4 +; TONGA-NEXT: v_xor_b32_e32 v2, v2, v10 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10 +; TONGA-NEXT: v_mul_lo_u32 v6, v4, v11 ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v8 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v6, v5 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3 -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v5, v6 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 +; TONGA-NEXT: v_sub_u32_e32 v7, vcc, v5, v11 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v11 +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3 ; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32: diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 19f0e93c308d8..934f8a2b160ee 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -873,20 +873,20 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; NOSDWA-NEXT: v_lshrrev_b32_e32 v13, 16, v5 ; NOSDWA-NEXT: v_mul_f16_e32 v1, v5, v1 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v14, 16, v4 ; NOSDWA-NEXT: v_mul_f16_e32 v0, v4, v0 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; NOSDWA-NEXT: v_mul_f16_e32 v10, v11, v10 +; NOSDWA-NEXT: v_mul_f16_e32 v4, v11, v10 ; NOSDWA-NEXT: v_mul_f16_e32 v7, v12, v7 ; NOSDWA-NEXT: v_mul_f16_e32 v6, v13, v6 -; NOSDWA-NEXT: v_mul_f16_e32 v4, v4, v5 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; NOSDWA-NEXT: v_mul_f16_e32 v5, v14, v5 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v5 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v4 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v7 ; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v6 -; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v4 +; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v5 ; NOSDWA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; NOSDWA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll b/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll index 5ff2f24d294dc..4bb7f2bdd21b4 100644 --- a/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll +++ b/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc -mtriple=amdgcn -O1 -mcpu=gfx90a -debug-only=machine-scheduler -filetype=null < %s 2>&1 | FileCheck --check-prefix=DEBUG %s +; RUN: llc -mtriple=amdgcn -O1 -mcpu=gfx90a -debug-only=machine-scheduler -amdgpu-use-amdgpu-trackers=0 -filetype=null < %s 2>&1 | FileCheck --check-prefix=DEBUG %s ; DEBUG: Attempting to revert scheduling. diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index 3a2d056dc504a..357dc98c854ba 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -296,18 +296,18 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v8 ; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 +; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 +; GCN-NEXT: v_or_b32_e32 v11, v9, v11 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 +; GCN-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc ; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v9 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v12 @@ -345,18 +345,18 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 ; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 +; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v16 +; GCN-NEXT: v_or_b32_e32 v11, v9, v11 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 +; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 @@ -394,18 +394,18 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 ; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_subrev_i32_e64 v16, s[6:7], 64, v8 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 +; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v16 +; GCN-NEXT: v_or_b32_e32 v11, v9, v11 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc ; GCN-NEXT: v_sub_i32_e64 v9, s[6:7], 64, v12 +; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v19, vcc ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 28330bfc9bb69..ffae40c9514e9 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -911,20 +911,20 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; SI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshl_b64 v[5:6], v[5:6], v2 -; SI-NEXT: v_lshl_b64 v[3:4], v[3:4], v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshl_b64 v[9:10], v[9:10], v13 -; SI-NEXT: v_lshl_b64 v[7:8], v[7:8], v11 -; SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 +; SI-NEXT: v_lshl_b64 v[6:7], v[6:7], v13 +; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], v11 +; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: shl_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll index dd42a1dd44320..f28541055b57e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll @@ -321,15 +321,15 @@ define void @v_shuffle_v2i64_v4i64__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll index 7ee7c83e0122d..78d64cc814c4d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll @@ -295,10 +295,10 @@ define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -591,10 +591,10 @@ define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -637,15 +637,15 @@ define void @v_shuffle_v2i64_v8i64__15_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v16, v14 -; GFX900-NEXT: v_mov_b32_e32 v17, v15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v18, v0 -; GFX900-NEXT: v_mov_b32_e32 v19, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v18, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v16 +; GFX900-NEXT: v_mov_b32_e32 v3, v17 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v18, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -699,13 +699,13 @@ define void @v_shuffle_v2i64_v8i64__15_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v18 ; GFX900-NEXT: v_mov_b32_e32 v1, v19 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -755,13 +755,13 @@ define void @v_shuffle_v2i64_v8i64__15_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v2, v20 ; GFX900-NEXT: v_mov_b32_e32 v3, v21 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v22, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -811,13 +811,13 @@ define void @v_shuffle_v2i64_v8i64__15_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v24, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v4, v22 ; GFX900-NEXT: v_mov_b32_e32 v5, v23 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v24, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -867,13 +867,13 @@ define void @v_shuffle_v2i64_v8i64__15_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v26, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, v24 ; GFX900-NEXT: v_mov_b32_e32 v7, v25 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v26, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -923,13 +923,13 @@ define void @v_shuffle_v2i64_v8i64__15_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v28, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v8, v26 ; GFX900-NEXT: v_mov_b32_e32 v9, v27 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v28, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -979,13 +979,13 @@ define void @v_shuffle_v2i64_v8i64__15_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v30, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, v28 ; GFX900-NEXT: v_mov_b32_e32 v11, v29 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v30, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1035,13 +1035,13 @@ define void @v_shuffle_v2i64_v8i64__15_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v32, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, v30 ; GFX900-NEXT: v_mov_b32_e32 v13, v31 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v32, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1091,12 +1091,12 @@ define void @v_shuffle_v2i64_v8i64__15_8(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v14 ; GFX900-NEXT: v_mov_b32_e32 v3, v15 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1465,10 +1465,10 @@ define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1825,12 +1825,12 @@ define void @v_shuffle_v2i64_v8i64__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v14 ; GFX900-NEXT: v_mov_b32_e32 v3, v15 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1876,10 +1876,10 @@ define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1922,12 +1922,12 @@ define void @v_shuffle_v2i64_v8i64__9_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v18, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v18, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1978,12 +1978,12 @@ define void @v_shuffle_v2i64_v8i64__10_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v18, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2034,12 +2034,12 @@ define void @v_shuffle_v2i64_v8i64__11_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v18, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v18, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2090,12 +2090,12 @@ define void @v_shuffle_v2i64_v8i64__12_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v18, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v0 +; GFX900-NEXT: v_mov_b32_e32 v13, v1 +; GFX900-NEXT: global_store_dwordx4 v18, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2146,12 +2146,12 @@ define void @v_shuffle_v2i64_v8i64__13_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v18, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v0 +; GFX900-NEXT: v_mov_b32_e32 v15, v1 +; GFX900-NEXT: global_store_dwordx4 v18, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2202,12 +2202,12 @@ define void @v_shuffle_v2i64_v8i64__14_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v18, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v0 +; GFX900-NEXT: v_mov_b32_e32 v17, v1 +; GFX900-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2687,15 +2687,15 @@ define void @v_shuffle_v2i64_v8i64__9_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v20, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v20, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2743,15 +2743,15 @@ define void @v_shuffle_v2i64_v8i64__10_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v20, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2799,15 +2799,15 @@ define void @v_shuffle_v2i64_v8i64__11_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v20, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v2 +; GFX900-NEXT: v_mov_b32_e32 v13, v3 +; GFX900-NEXT: global_store_dwordx4 v20, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2855,15 +2855,15 @@ define void @v_shuffle_v2i64_v8i64__12_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v20, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v2 +; GFX900-NEXT: v_mov_b32_e32 v15, v3 +; GFX900-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2911,15 +2911,15 @@ define void @v_shuffle_v2i64_v8i64__13_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v20, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v2 +; GFX900-NEXT: v_mov_b32_e32 v17, v3 +; GFX900-NEXT: global_store_dwordx4 v20, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2967,15 +2967,15 @@ define void @v_shuffle_v2i64_v8i64__14_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v20, 0 +; GFX900-NEXT: v_mov_b32_e32 v18, v2 +; GFX900-NEXT: v_mov_b32_e32 v19, v3 +; GFX900-NEXT: global_store_dwordx4 v20, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3455,15 +3455,15 @@ define void @v_shuffle_v2i64_v8i64__9_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v22, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v22, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3511,15 +3511,15 @@ define void @v_shuffle_v2i64_v8i64__10_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v22, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx4 v22, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3567,15 +3567,15 @@ define void @v_shuffle_v2i64_v8i64__11_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v22, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v4 +; GFX900-NEXT: v_mov_b32_e32 v15, v5 +; GFX900-NEXT: global_store_dwordx4 v22, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3623,15 +3623,15 @@ define void @v_shuffle_v2i64_v8i64__12_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v22, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v4 +; GFX900-NEXT: v_mov_b32_e32 v17, v5 +; GFX900-NEXT: global_store_dwordx4 v22, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3679,15 +3679,15 @@ define void @v_shuffle_v2i64_v8i64__13_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v22, 0 +; GFX900-NEXT: v_mov_b32_e32 v18, v4 +; GFX900-NEXT: v_mov_b32_e32 v19, v5 +; GFX900-NEXT: global_store_dwordx4 v22, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3735,15 +3735,15 @@ define void @v_shuffle_v2i64_v8i64__14_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v22, 0 +; GFX900-NEXT: v_mov_b32_e32 v20, v4 +; GFX900-NEXT: v_mov_b32_e32 v21, v5 +; GFX900-NEXT: global_store_dwordx4 v22, v[18:21], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4223,15 +4223,15 @@ define void @v_shuffle_v2i64_v8i64__9_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v24, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v24, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4279,15 +4279,15 @@ define void @v_shuffle_v2i64_v8i64__10_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v24, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v6 +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: global_store_dwordx4 v24, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4335,15 +4335,15 @@ define void @v_shuffle_v2i64_v8i64__11_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v24, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v6 +; GFX900-NEXT: v_mov_b32_e32 v17, v7 +; GFX900-NEXT: global_store_dwordx4 v24, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4391,15 +4391,15 @@ define void @v_shuffle_v2i64_v8i64__12_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v24, 0 +; GFX900-NEXT: v_mov_b32_e32 v18, v6 +; GFX900-NEXT: v_mov_b32_e32 v19, v7 +; GFX900-NEXT: global_store_dwordx4 v24, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4447,15 +4447,15 @@ define void @v_shuffle_v2i64_v8i64__13_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v24, 0 +; GFX900-NEXT: v_mov_b32_e32 v20, v6 +; GFX900-NEXT: v_mov_b32_e32 v21, v7 +; GFX900-NEXT: global_store_dwordx4 v24, v[18:21], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4503,15 +4503,15 @@ define void @v_shuffle_v2i64_v8i64__14_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v24, 0 +; GFX900-NEXT: v_mov_b32_e32 v22, v6 +; GFX900-NEXT: v_mov_b32_e32 v23, v7 +; GFX900-NEXT: global_store_dwordx4 v24, v[20:23], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4991,15 +4991,15 @@ define void @v_shuffle_v2i64_v8i64__9_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v26, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v8 +; GFX900-NEXT: v_mov_b32_e32 v15, v9 +; GFX900-NEXT: global_store_dwordx4 v26, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5047,15 +5047,15 @@ define void @v_shuffle_v2i64_v8i64__10_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v26, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v8 +; GFX900-NEXT: v_mov_b32_e32 v17, v9 +; GFX900-NEXT: global_store_dwordx4 v26, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5103,15 +5103,15 @@ define void @v_shuffle_v2i64_v8i64__11_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v26, 0 +; GFX900-NEXT: v_mov_b32_e32 v18, v8 +; GFX900-NEXT: v_mov_b32_e32 v19, v9 +; GFX900-NEXT: global_store_dwordx4 v26, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5159,15 +5159,15 @@ define void @v_shuffle_v2i64_v8i64__12_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v26, 0 +; GFX900-NEXT: v_mov_b32_e32 v20, v8 +; GFX900-NEXT: v_mov_b32_e32 v21, v9 +; GFX900-NEXT: global_store_dwordx4 v26, v[18:21], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5215,15 +5215,15 @@ define void @v_shuffle_v2i64_v8i64__13_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v26, 0 +; GFX900-NEXT: v_mov_b32_e32 v22, v8 +; GFX900-NEXT: v_mov_b32_e32 v23, v9 +; GFX900-NEXT: global_store_dwordx4 v26, v[20:23], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5271,15 +5271,15 @@ define void @v_shuffle_v2i64_v8i64__14_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v26, 0 +; GFX900-NEXT: v_mov_b32_e32 v24, v8 +; GFX900-NEXT: v_mov_b32_e32 v25, v9 +; GFX900-NEXT: global_store_dwordx4 v26, v[22:25], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5759,15 +5759,15 @@ define void @v_shuffle_v2i64_v8i64__9_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v28, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v10 +; GFX900-NEXT: v_mov_b32_e32 v17, v11 +; GFX900-NEXT: global_store_dwordx4 v28, v[14:17], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5815,15 +5815,15 @@ define void @v_shuffle_v2i64_v8i64__10_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v28, 0 +; GFX900-NEXT: v_mov_b32_e32 v18, v10 +; GFX900-NEXT: v_mov_b32_e32 v19, v11 +; GFX900-NEXT: global_store_dwordx4 v28, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5871,15 +5871,15 @@ define void @v_shuffle_v2i64_v8i64__11_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v28, 0 +; GFX900-NEXT: v_mov_b32_e32 v20, v10 +; GFX900-NEXT: v_mov_b32_e32 v21, v11 +; GFX900-NEXT: global_store_dwordx4 v28, v[18:21], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5927,15 +5927,15 @@ define void @v_shuffle_v2i64_v8i64__12_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v28, 0 +; GFX900-NEXT: v_mov_b32_e32 v22, v10 +; GFX900-NEXT: v_mov_b32_e32 v23, v11 +; GFX900-NEXT: global_store_dwordx4 v28, v[20:23], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5983,15 +5983,15 @@ define void @v_shuffle_v2i64_v8i64__13_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v28, 0 +; GFX900-NEXT: v_mov_b32_e32 v24, v10 +; GFX900-NEXT: v_mov_b32_e32 v25, v11 +; GFX900-NEXT: global_store_dwordx4 v28, v[22:25], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6039,15 +6039,15 @@ define void @v_shuffle_v2i64_v8i64__14_5(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v28, 0 +; GFX900-NEXT: v_mov_b32_e32 v26, v10 +; GFX900-NEXT: v_mov_b32_e32 v27, v11 +; GFX900-NEXT: global_store_dwordx4 v28, v[24:27], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6527,15 +6527,15 @@ define void @v_shuffle_v2i64_v8i64__9_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v30, 0 +; GFX900-NEXT: v_mov_b32_e32 v18, v12 +; GFX900-NEXT: v_mov_b32_e32 v19, v13 +; GFX900-NEXT: global_store_dwordx4 v30, v[16:19], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6583,15 +6583,15 @@ define void @v_shuffle_v2i64_v8i64__10_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v30, 0 +; GFX900-NEXT: v_mov_b32_e32 v20, v12 +; GFX900-NEXT: v_mov_b32_e32 v21, v13 +; GFX900-NEXT: global_store_dwordx4 v30, v[18:21], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6639,15 +6639,15 @@ define void @v_shuffle_v2i64_v8i64__11_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v30, 0 +; GFX900-NEXT: v_mov_b32_e32 v22, v12 +; GFX900-NEXT: v_mov_b32_e32 v23, v13 +; GFX900-NEXT: global_store_dwordx4 v30, v[20:23], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6695,15 +6695,15 @@ define void @v_shuffle_v2i64_v8i64__12_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v30, 0 +; GFX900-NEXT: v_mov_b32_e32 v24, v12 +; GFX900-NEXT: v_mov_b32_e32 v25, v13 +; GFX900-NEXT: global_store_dwordx4 v30, v[22:25], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6751,15 +6751,15 @@ define void @v_shuffle_v2i64_v8i64__13_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v30, 0 +; GFX900-NEXT: v_mov_b32_e32 v26, v12 +; GFX900-NEXT: v_mov_b32_e32 v27, v13 +; GFX900-NEXT: global_store_dwordx4 v30, v[24:27], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6807,15 +6807,15 @@ define void @v_shuffle_v2i64_v8i64__14_6(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v30, 0 +; GFX900-NEXT: v_mov_b32_e32 v28, v12 +; GFX900-NEXT: v_mov_b32_e32 v29, v13 +; GFX900-NEXT: global_store_dwordx4 v30, v[26:29], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7295,15 +7295,15 @@ define void @v_shuffle_v2i64_v8i64__9_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v32, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, v16 -; GFX900-NEXT: v_mov_b32_e32 v5, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v20, v14 +; GFX900-NEXT: v_mov_b32_e32 v21, v15 +; GFX900-NEXT: global_store_dwordx4 v32, v[18:21], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7351,15 +7351,15 @@ define void @v_shuffle_v2i64_v8i64__10_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v32, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, v16 -; GFX900-NEXT: v_mov_b32_e32 v7, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v22, v14 +; GFX900-NEXT: v_mov_b32_e32 v23, v15 +; GFX900-NEXT: global_store_dwordx4 v32, v[20:23], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7407,15 +7407,15 @@ define void @v_shuffle_v2i64_v8i64__11_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v32, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v16 -; GFX900-NEXT: v_mov_b32_e32 v9, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v24, v14 +; GFX900-NEXT: v_mov_b32_e32 v25, v15 +; GFX900-NEXT: global_store_dwordx4 v32, v[22:25], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7463,15 +7463,15 @@ define void @v_shuffle_v2i64_v8i64__12_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v32, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v16 -; GFX900-NEXT: v_mov_b32_e32 v11, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v26, v14 +; GFX900-NEXT: v_mov_b32_e32 v27, v15 +; GFX900-NEXT: global_store_dwordx4 v32, v[24:27], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7519,15 +7519,15 @@ define void @v_shuffle_v2i64_v8i64__13_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v32, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, v16 -; GFX900-NEXT: v_mov_b32_e32 v13, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v28, v14 +; GFX900-NEXT: v_mov_b32_e32 v29, v15 +; GFX900-NEXT: global_store_dwordx4 v32, v[26:29], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7575,15 +7575,15 @@ define void @v_shuffle_v2i64_v8i64__14_7(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ; def v[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v32, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, v16 -; GFX900-NEXT: v_mov_b32_e32 v15, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v30, v14 +; GFX900-NEXT: v_mov_b32_e32 v31, v15 +; GFX900-NEXT: global_store_dwordx4 v32, v[28:31], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7917,10 +7917,10 @@ define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8292,10 +8292,10 @@ define void @v_shuffle_v2i64_v8i64__0_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8348,10 +8348,10 @@ define void @v_shuffle_v2i64_v8i64__1_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8404,10 +8404,10 @@ define void @v_shuffle_v2i64_v8i64__2_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v8 ; GFX900-NEXT: v_mov_b32_e32 v7, v9 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8460,10 +8460,10 @@ define void @v_shuffle_v2i64_v8i64__3_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v24, 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v10 ; GFX900-NEXT: v_mov_b32_e32 v9, v11 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8516,10 +8516,10 @@ define void @v_shuffle_v2i64_v8i64__4_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v26, 0 ; GFX900-NEXT: v_mov_b32_e32 v10, v12 ; GFX900-NEXT: v_mov_b32_e32 v11, v13 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8572,10 +8572,10 @@ define void @v_shuffle_v2i64_v8i64__5_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v28, 0 ; GFX900-NEXT: v_mov_b32_e32 v12, v14 ; GFX900-NEXT: v_mov_b32_e32 v13, v15 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8628,10 +8628,10 @@ define void @v_shuffle_v2i64_v8i64__6_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v30, 0 ; GFX900-NEXT: v_mov_b32_e32 v14, v16 ; GFX900-NEXT: v_mov_b32_e32 v15, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8684,10 +8684,10 @@ define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v16 ; GFX900-NEXT: v_mov_b32_e32 v1, v17 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9096,10 +9096,10 @@ define void @v_shuffle_v2i64_v8i64__0_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v6 ; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9152,10 +9152,10 @@ define void @v_shuffle_v2i64_v8i64__1_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v8 ; GFX900-NEXT: v_mov_b32_e32 v5, v9 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9208,10 +9208,10 @@ define void @v_shuffle_v2i64_v8i64__2_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v10 ; GFX900-NEXT: v_mov_b32_e32 v7, v11 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9264,10 +9264,10 @@ define void @v_shuffle_v2i64_v8i64__3_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v24, 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v12 ; GFX900-NEXT: v_mov_b32_e32 v9, v13 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9320,10 +9320,10 @@ define void @v_shuffle_v2i64_v8i64__4_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v26, 0 ; GFX900-NEXT: v_mov_b32_e32 v10, v14 ; GFX900-NEXT: v_mov_b32_e32 v11, v15 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9376,10 +9376,10 @@ define void @v_shuffle_v2i64_v8i64__5_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v28, 0 ; GFX900-NEXT: v_mov_b32_e32 v12, v16 ; GFX900-NEXT: v_mov_b32_e32 v13, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9432,10 +9432,10 @@ define void @v_shuffle_v2i64_v8i64__6_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v30, 0 ; GFX900-NEXT: v_mov_b32_e32 v14, v18 ; GFX900-NEXT: v_mov_b32_e32 v15, v19 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9488,10 +9488,10 @@ define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v16 ; GFX900-NEXT: v_mov_b32_e32 v3, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9900,10 +9900,10 @@ define void @v_shuffle_v2i64_v8i64__0_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v8 ; GFX900-NEXT: v_mov_b32_e32 v3, v9 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9956,10 +9956,10 @@ define void @v_shuffle_v2i64_v8i64__1_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v10 ; GFX900-NEXT: v_mov_b32_e32 v5, v11 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10012,10 +10012,10 @@ define void @v_shuffle_v2i64_v8i64__2_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v12 ; GFX900-NEXT: v_mov_b32_e32 v7, v13 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10068,10 +10068,10 @@ define void @v_shuffle_v2i64_v8i64__3_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v24, 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v14 ; GFX900-NEXT: v_mov_b32_e32 v9, v15 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10124,10 +10124,10 @@ define void @v_shuffle_v2i64_v8i64__4_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v26, 0 ; GFX900-NEXT: v_mov_b32_e32 v10, v16 ; GFX900-NEXT: v_mov_b32_e32 v11, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10180,10 +10180,10 @@ define void @v_shuffle_v2i64_v8i64__5_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v28, 0 ; GFX900-NEXT: v_mov_b32_e32 v12, v18 ; GFX900-NEXT: v_mov_b32_e32 v13, v19 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10236,10 +10236,10 @@ define void @v_shuffle_v2i64_v8i64__6_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v30, 0 ; GFX900-NEXT: v_mov_b32_e32 v14, v20 ; GFX900-NEXT: v_mov_b32_e32 v15, v21 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10292,10 +10292,10 @@ define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v16 ; GFX900-NEXT: v_mov_b32_e32 v5, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10704,10 +10704,10 @@ define void @v_shuffle_v2i64_v8i64__0_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v10 ; GFX900-NEXT: v_mov_b32_e32 v3, v11 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10760,10 +10760,10 @@ define void @v_shuffle_v2i64_v8i64__1_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v12 ; GFX900-NEXT: v_mov_b32_e32 v5, v13 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10816,10 +10816,10 @@ define void @v_shuffle_v2i64_v8i64__2_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v14 ; GFX900-NEXT: v_mov_b32_e32 v7, v15 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10872,10 +10872,10 @@ define void @v_shuffle_v2i64_v8i64__3_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v24, 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v16 ; GFX900-NEXT: v_mov_b32_e32 v9, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10928,10 +10928,10 @@ define void @v_shuffle_v2i64_v8i64__4_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v26, 0 ; GFX900-NEXT: v_mov_b32_e32 v10, v18 ; GFX900-NEXT: v_mov_b32_e32 v11, v19 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10984,10 +10984,10 @@ define void @v_shuffle_v2i64_v8i64__5_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v28, 0 ; GFX900-NEXT: v_mov_b32_e32 v12, v20 ; GFX900-NEXT: v_mov_b32_e32 v13, v21 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11040,10 +11040,10 @@ define void @v_shuffle_v2i64_v8i64__6_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v30, 0 ; GFX900-NEXT: v_mov_b32_e32 v14, v22 ; GFX900-NEXT: v_mov_b32_e32 v15, v23 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11096,10 +11096,10 @@ define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v16 ; GFX900-NEXT: v_mov_b32_e32 v7, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11508,10 +11508,10 @@ define void @v_shuffle_v2i64_v8i64__0_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v12 ; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11564,10 +11564,10 @@ define void @v_shuffle_v2i64_v8i64__1_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v14 ; GFX900-NEXT: v_mov_b32_e32 v5, v15 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11620,10 +11620,10 @@ define void @v_shuffle_v2i64_v8i64__2_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v16 ; GFX900-NEXT: v_mov_b32_e32 v7, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11676,10 +11676,10 @@ define void @v_shuffle_v2i64_v8i64__3_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v24, 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v18 ; GFX900-NEXT: v_mov_b32_e32 v9, v19 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11732,10 +11732,10 @@ define void @v_shuffle_v2i64_v8i64__4_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v26, 0 ; GFX900-NEXT: v_mov_b32_e32 v10, v20 ; GFX900-NEXT: v_mov_b32_e32 v11, v21 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11788,10 +11788,10 @@ define void @v_shuffle_v2i64_v8i64__5_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v28, 0 ; GFX900-NEXT: v_mov_b32_e32 v12, v22 ; GFX900-NEXT: v_mov_b32_e32 v13, v23 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11844,10 +11844,10 @@ define void @v_shuffle_v2i64_v8i64__6_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v30, 0 ; GFX900-NEXT: v_mov_b32_e32 v14, v24 ; GFX900-NEXT: v_mov_b32_e32 v15, v25 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -11900,10 +11900,10 @@ define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v16 ; GFX900-NEXT: v_mov_b32_e32 v9, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12312,10 +12312,10 @@ define void @v_shuffle_v2i64_v8i64__0_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v14 ; GFX900-NEXT: v_mov_b32_e32 v3, v15 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12368,10 +12368,10 @@ define void @v_shuffle_v2i64_v8i64__1_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v16 ; GFX900-NEXT: v_mov_b32_e32 v5, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12424,10 +12424,10 @@ define void @v_shuffle_v2i64_v8i64__2_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v18 ; GFX900-NEXT: v_mov_b32_e32 v7, v19 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12480,10 +12480,10 @@ define void @v_shuffle_v2i64_v8i64__3_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v24, 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v20 ; GFX900-NEXT: v_mov_b32_e32 v9, v21 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12536,10 +12536,10 @@ define void @v_shuffle_v2i64_v8i64__4_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v26, 0 ; GFX900-NEXT: v_mov_b32_e32 v10, v22 ; GFX900-NEXT: v_mov_b32_e32 v11, v23 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12592,10 +12592,10 @@ define void @v_shuffle_v2i64_v8i64__5_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v28, 0 ; GFX900-NEXT: v_mov_b32_e32 v12, v24 ; GFX900-NEXT: v_mov_b32_e32 v13, v25 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12648,10 +12648,10 @@ define void @v_shuffle_v2i64_v8i64__6_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v30, 0 ; GFX900-NEXT: v_mov_b32_e32 v14, v26 ; GFX900-NEXT: v_mov_b32_e32 v15, v27 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12704,10 +12704,10 @@ define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v10, v16 ; GFX900-NEXT: v_mov_b32_e32 v11, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13116,10 +13116,10 @@ define void @v_shuffle_v2i64_v8i64__0_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v16 ; GFX900-NEXT: v_mov_b32_e32 v3, v17 -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13172,10 +13172,10 @@ define void @v_shuffle_v2i64_v8i64__1_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v18 ; GFX900-NEXT: v_mov_b32_e32 v5, v19 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13228,10 +13228,10 @@ define void @v_shuffle_v2i64_v8i64__2_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:21] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v20 ; GFX900-NEXT: v_mov_b32_e32 v7, v21 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13284,10 +13284,10 @@ define void @v_shuffle_v2i64_v8i64__3_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:23] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v24, 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v22 ; GFX900-NEXT: v_mov_b32_e32 v9, v23 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13340,10 +13340,10 @@ define void @v_shuffle_v2i64_v8i64__4_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[10:25] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v26, 0 ; GFX900-NEXT: v_mov_b32_e32 v10, v24 ; GFX900-NEXT: v_mov_b32_e32 v11, v25 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13396,10 +13396,10 @@ define void @v_shuffle_v2i64_v8i64__5_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[12:27] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v28, 0 ; GFX900-NEXT: v_mov_b32_e32 v12, v26 ; GFX900-NEXT: v_mov_b32_e32 v13, v27 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13452,10 +13452,10 @@ define void @v_shuffle_v2i64_v8i64__6_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[14:29] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v30, 0 ; GFX900-NEXT: v_mov_b32_e32 v14, v28 ; GFX900-NEXT: v_mov_b32_e32 v15, v29 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -13508,10 +13508,10 @@ define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, 0 ; GFX900-NEXT: v_mov_b32_e32 v12, v16 ; GFX900-NEXT: v_mov_b32_e32 v13, v17 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v18, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll index ae31524ebaa7f..4b1f52cc3bb36 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll @@ -321,15 +321,15 @@ define void @v_shuffle_v2p0_v4p0__7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll index 6e156d2d4a2f5..fc77dcd39f1b8 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll @@ -324,15 +324,15 @@ define void @v_shuffle_v3i64_v4i64__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1582,16 +1582,16 @@ define void @v_shuffle_v3i64_v4i64__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2558,15 +2558,15 @@ define void @v_shuffle_v3i64_v4i64__7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3418,19 +3418,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4283,16 +4283,16 @@ define void @v_shuffle_v3i64_v4i64__7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5029,19 +5029,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5879,19 +5879,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6735,19 +6735,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7542,19 +7542,19 @@ define void @v_shuffle_v3i64_v4i64__7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll index b03066e66cf66..da03a6aadb726 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll @@ -324,15 +324,15 @@ define void @v_shuffle_v3p0_v4p0__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1582,16 +1582,16 @@ define void @v_shuffle_v3p0_v4p0__7_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2558,15 +2558,15 @@ define void @v_shuffle_v3p0_v4p0__7_0_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3418,19 +3418,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4283,16 +4283,16 @@ define void @v_shuffle_v3p0_v4p0__7_0_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5029,19 +5029,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5879,19 +5879,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6735,19 +6735,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_6: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7542,19 +7542,19 @@ define void @v_shuffle_v3p0_v4p0__7_0_7(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_7: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll index 32f6e00716e37..70eb64e22598f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll @@ -324,15 +324,15 @@ define void @v_shuffle_v4i64_v4i64__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,16 +918,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1190,18 +1190,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2123,18 +2123,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2262,18 +2262,17 @@ define void @v_shuffle_v4i64_v4i64__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: v_mov_b32_e32 v6, v0 ; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2736,16 +2735,16 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2931,18 +2930,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3134,18 +3133,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3765,18 +3764,19 @@ define void @v_shuffle_v4i64_v4i64__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5246,18 +5246,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6718,18 +6718,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6852,19 +6852,19 @@ define void @v_shuffle_v4i64_v4i64__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: v_mov_b32_e32 v2, v6 ; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8043,18 +8043,19 @@ define void @v_shuffle_v4i64_v4i64__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8713,18 +8714,18 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9476,18 +9477,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10907,18 +10908,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12302,18 +12303,18 @@ define void @v_shuffle_v4i64_v4i64__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll index 257af574366a6..786beb13cd4e0 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll @@ -324,15 +324,15 @@ define void @v_shuffle_v4p0_v4p0__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -918,16 +918,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1190,18 +1190,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2123,18 +2123,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2262,18 +2262,17 @@ define void @v_shuffle_v4p0_v4p0__7_1_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: v_mov_b32_e32 v6, v0 ; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:11] -; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2736,16 +2735,16 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v10, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2931,18 +2930,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3134,18 +3133,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_6_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v8, v4 -; GFX900-NEXT: v_mov_b32_e32 v9, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3765,18 +3764,19 @@ define void @v_shuffle_v4p0_v4p0__7_0_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5246,18 +5246,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6718,18 +6718,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6852,19 +6852,19 @@ define void @v_shuffle_v4p0_v4p0__7_2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: v_mov_b32_e32 v2, v6 ; GFX900-NEXT: v_mov_b32_e32 v3, v7 -; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v12 -; GFX900-NEXT: v_mov_b32_e32 v3, v13 -; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8043,18 +8043,19 @@ define void @v_shuffle_v4p0_v4p0__7_0_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -8713,18 +8714,18 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -9476,18 +9477,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -10907,18 +10908,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -12302,18 +12303,18 @@ define void @v_shuffle_v4p0_v4p0__7_0_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v6 -; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v8, v6 -; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, v0 -; GFX900-NEXT: v_mov_b32_e32 v11, v1 -; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll index c90d7887f2ff6..135c15a05cb15 100644 --- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll +++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=0 < %s | FileCheck %s define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) { ; CHECK-LABEL: excess_soft_clause_reg_pressure: diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index 1a0f75e048cb9..88785484d7ab0 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -45,10 +45,10 @@ define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ; GFX908-NEXT: v_accvgpr_write_b32 a6, v3 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a4, v1 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v3, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, 0 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 80c0d0f45eb97..a843b4cfb3f0d 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -602,46 +602,46 @@ define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; SI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_ashr_i64 v[5:6], v[5:6], v2 -; SI-NEXT: v_ashr_i64 v[3:4], v[3:4], v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_ashr_i64 v[2:3], v[2:3], v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ashr_i64 v[9:10], v[9:10], v13 -; SI-NEXT: v_ashr_i64 v[7:8], v[7:8], v11 -; SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 +; SI-NEXT: v_ashr_i64 v[6:7], v[6:7], v13 +; SI-NEXT: v_ashr_i64 v[4:5], v[4:5], v11 +; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], v8 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: ashr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; VI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 -; VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_ashrrev_i64 v[5:6], v2, v[5:6] -; VI-NEXT: v_ashrrev_i64 v[3:4], v0, v[3:4] +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_ashrrev_i64 v[2:3], v10, v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ashrrev_i64 v[9:10], v13, v[9:10] -; VI-NEXT: v_ashrrev_i64 v[7:8], v11, v[7:8] -; VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 +; VI-NEXT: v_ashrrev_i64 v[6:7], v13, v[6:7] +; VI-NEXT: v_ashrrev_i64 v[4:5], v11, v[4:5] +; VI-NEXT: v_ashrrev_i64 v[0:1], v8, v[0:1] +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: ashr_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index e12e31b14e97d..6e89ba2be67f3 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -5548,24 +5548,24 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_madmk_f32 v11, v18, 0xcf800000, v11 ; TAHITI-NEXT: v_cvt_u32_f32_e32 v18, v18 ; TAHITI-NEXT: v_cvt_u32_f32_e32 v11, v11 -; TAHITI-NEXT: v_mul_lo_u32 v22, v19, v18 -; TAHITI-NEXT: v_mul_hi_u32 v21, v19, v11 +; TAHITI-NEXT: v_mul_lo_u32 v21, v19, v18 +; TAHITI-NEXT: v_mul_hi_u32 v22, v19, v11 ; TAHITI-NEXT: v_mul_lo_u32 v23, v20, v11 -; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v22, v21 +; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v21, v22 ; TAHITI-NEXT: v_mul_lo_u32 v22, v19, v11 ; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v21, v23 ; TAHITI-NEXT: v_mul_lo_u32 v23, v11, v21 ; TAHITI-NEXT: v_mul_hi_u32 v24, v11, v22 ; TAHITI-NEXT: v_mul_hi_u32 v25, v11, v21 -; TAHITI-NEXT: v_mul_hi_u32 v26, v18, v21 -; TAHITI-NEXT: v_mul_lo_u32 v21, v18, v21 ; TAHITI-NEXT: v_add_i32_e32 v23, vcc, v24, v23 ; TAHITI-NEXT: v_addc_u32_e32 v24, vcc, 0, v25, vcc ; TAHITI-NEXT: v_mul_lo_u32 v25, v18, v22 ; TAHITI-NEXT: v_mul_hi_u32 v22, v18, v22 ; TAHITI-NEXT: v_add_i32_e32 v23, vcc, v23, v25 +; TAHITI-NEXT: v_mul_hi_u32 v23, v18, v21 +; TAHITI-NEXT: v_mul_lo_u32 v21, v18, v21 ; TAHITI-NEXT: v_addc_u32_e32 v22, vcc, v24, v22, vcc -; TAHITI-NEXT: v_addc_u32_e32 v23, vcc, 0, v26, vcc +; TAHITI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v22, v21 ; TAHITI-NEXT: v_addc_u32_e32 v22, vcc, 0, v23, vcc ; TAHITI-NEXT: v_add_i32_e32 v11, vcc, v11, v21 @@ -5576,17 +5576,17 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_mul_lo_u32 v19, v19, v11 ; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v21, v22 ; TAHITI-NEXT: v_add_i32_e32 v20, vcc, v20, v21 -; TAHITI-NEXT: v_mul_lo_u32 v23, v11, v20 -; TAHITI-NEXT: v_mul_hi_u32 v24, v11, v19 -; TAHITI-NEXT: v_mul_hi_u32 v25, v11, v20 -; TAHITI-NEXT: v_mul_hi_u32 v22, v18, v19 -; TAHITI-NEXT: v_mul_lo_u32 v19, v18, v19 +; TAHITI-NEXT: v_mul_lo_u32 v21, v11, v20 +; TAHITI-NEXT: v_mul_hi_u32 v22, v11, v19 +; TAHITI-NEXT: v_mul_hi_u32 v23, v11, v20 +; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v22, v21 +; TAHITI-NEXT: v_mul_lo_u32 v22, v18, v19 +; TAHITI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; TAHITI-NEXT: v_mul_hi_u32 v19, v18, v19 +; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v21, v22 ; TAHITI-NEXT: v_mul_hi_u32 v21, v18, v20 -; TAHITI-NEXT: v_add_i32_e32 v23, vcc, v24, v23 -; TAHITI-NEXT: v_addc_u32_e32 v24, vcc, 0, v25, vcc ; TAHITI-NEXT: v_mul_lo_u32 v20, v18, v20 -; TAHITI-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; TAHITI-NEXT: v_addc_u32_e32 v19, vcc, v24, v22, vcc +; TAHITI-NEXT: v_addc_u32_e32 v19, vcc, v23, v19, vcc ; TAHITI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc ; TAHITI-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; TAHITI-NEXT: v_addc_u32_e32 v20, vcc, 0, v21, vcc @@ -5597,18 +5597,18 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_xor_b32_e32 v20, v20, v19 ; TAHITI-NEXT: v_mul_lo_u32 v21, v20, v18 ; TAHITI-NEXT: v_mul_hi_u32 v22, v20, v11 -; TAHITI-NEXT: v_mul_hi_u32 v23, v20, v18 ; TAHITI-NEXT: v_addc_u32_e32 v15, vcc, v15, v19, vcc +; TAHITI-NEXT: v_mul_hi_u32 v23, v20, v18 ; TAHITI-NEXT: v_xor_b32_e32 v15, v15, v19 ; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v22, v21 -; TAHITI-NEXT: v_addc_u32_e32 v22, vcc, 0, v23, vcc -; TAHITI-NEXT: v_mul_lo_u32 v23, v15, v11 +; TAHITI-NEXT: v_mul_lo_u32 v22, v15, v11 +; TAHITI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; TAHITI-NEXT: v_mul_hi_u32 v11, v15, v11 -; TAHITI-NEXT: v_mul_hi_u32 v24, v15, v18 +; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v21, v22 +; TAHITI-NEXT: v_mul_hi_u32 v21, v15, v18 ; TAHITI-NEXT: v_mul_lo_u32 v18, v15, v18 -; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v21, v23 -; TAHITI-NEXT: v_addc_u32_e32 v11, vcc, v22, v11, vcc -; TAHITI-NEXT: v_addc_u32_e32 v21, vcc, 0, v24, vcc +; TAHITI-NEXT: v_addc_u32_e32 v11, vcc, v23, v11, vcc +; TAHITI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc ; TAHITI-NEXT: v_add_i32_e32 v11, vcc, v11, v18 ; TAHITI-NEXT: v_addc_u32_e32 v18, vcc, 0, v21, vcc ; TAHITI-NEXT: v_mul_lo_u32 v18, v9, v18 @@ -5700,15 +5700,15 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_mul_lo_u32 v21, v13, v19 ; TAHITI-NEXT: v_mul_hi_u32 v22, v13, v20 ; TAHITI-NEXT: v_mul_hi_u32 v23, v13, v19 -; TAHITI-NEXT: v_mul_hi_u32 v24, v14, v19 -; TAHITI-NEXT: v_mul_lo_u32 v19, v14, v19 ; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v22, v21 -; TAHITI-NEXT: v_addc_u32_e32 v22, vcc, 0, v23, vcc -; TAHITI-NEXT: v_mul_lo_u32 v23, v14, v20 +; TAHITI-NEXT: v_mul_lo_u32 v22, v14, v20 +; TAHITI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc ; TAHITI-NEXT: v_mul_hi_u32 v20, v14, v20 -; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v21, v23 -; TAHITI-NEXT: v_addc_u32_e32 v20, vcc, v22, v20, vcc -; TAHITI-NEXT: v_addc_u32_e32 v21, vcc, 0, v24, vcc +; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v21, v22 +; TAHITI-NEXT: v_mul_hi_u32 v21, v14, v19 +; TAHITI-NEXT: v_mul_lo_u32 v19, v14, v19 +; TAHITI-NEXT: v_addc_u32_e32 v20, vcc, v23, v20, vcc +; TAHITI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc ; TAHITI-NEXT: v_add_i32_e32 v19, vcc, v20, v19 ; TAHITI-NEXT: v_addc_u32_e32 v20, vcc, 0, v21, vcc ; TAHITI-NEXT: v_add_i32_e32 v13, vcc, v13, v19 @@ -6181,30 +6181,30 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc ; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v10 ; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v15, v8 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, v14, v9, vcc -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s1, v18, 0 -; TONGA-NEXT: v_mul_lo_u32 v14, s1, v19 -; TONGA-NEXT: v_mul_lo_u32 v15, s10, v18 -; TONGA-NEXT: v_mul_hi_u32 v20, v18, v8 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v19, v8, 0 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v14, v9 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v15, v9 -; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[8:9], v18, v9, 0 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], v19, v9, 0 -; TONGA-NEXT: v_add_u32_e32 v14, vcc, v20, v14 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v14, v10 -; TONGA-NEXT: v_addc_u32_e32 v10, vcc, v15, v11, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v10, v8 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v15, v8 +; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v14, v9, vcc +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s1, v11, 0 +; TONGA-NEXT: v_mul_lo_u32 v10, s1, v20 +; TONGA-NEXT: v_mul_lo_u32 v14, s10, v11 +; TONGA-NEXT: v_mul_hi_u32 v21, v11, v8 ; TONGA-NEXT: s_ashr_i32 s10, s3, 31 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v9 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v14, v9 +; TONGA-NEXT: v_mad_u64_u32 v[9:10], s[8:9], v11, v18, 0 +; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[8:9], v20, v8, 0 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v20, v18, 0 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v21, v9 +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v14 +; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v9, v15, vcc +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v19, vcc +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v18 ; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; TONGA-NEXT: s_add_u32 s8, s2, s10 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v18, v8 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v11, v8 ; TONGA-NEXT: s_mov_b32 s11, s10 ; TONGA-NEXT: s_addc_u32 s9, s3, s10 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v19, v9, vcc +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v20, v9, vcc ; TONGA-NEXT: s_xor_b64 s[12:13], s[8:9], s[10:11] ; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s12, v11, 0 ; TONGA-NEXT: v_mul_hi_u32 v14, s12, v10 @@ -6321,74 +6321,74 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_lo_u32 v18, v23, v21 ; TONGA-NEXT: v_add_u32_e32 v14, vcc, v14, v15 ; TONGA-NEXT: v_add_u32_e32 v18, vcc, v14, v18 -; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v21, v18, 0 ; TONGA-NEXT: v_mul_hi_u32 v19, v21, v13 -; TONGA-NEXT: v_add_u32_e32 v24, vcc, v19, v14 +; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v21, v18, 0 +; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v14 ; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v20, v13, 0 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v20, v18, 0 ; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v24, v13 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v19, v13 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v20, v18, 0 ; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v15, v14, vcc ; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v19, vcc ; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v18 ; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; TONGA-NEXT: v_add_u32_e32 v24, vcc, v21, v13 -; TONGA-NEXT: v_addc_u32_e32 v25, vcc, v20, v14, vcc -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v22, v24, 0 -; TONGA-NEXT: v_mul_lo_u32 v15, v22, v25 -; TONGA-NEXT: v_mul_lo_u32 v20, v23, v24 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v25, v13, 0 +; TONGA-NEXT: v_add_u32_e32 v21, vcc, v21, v13 +; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v20, v14, vcc +; TONGA-NEXT: v_mul_lo_u32 v15, v22, v20 +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v22, v21, 0 +; TONGA-NEXT: v_mul_lo_u32 v18, v23, v21 ; TONGA-NEXT: v_add_u32_e32 v14, vcc, v15, v14 -; TONGA-NEXT: v_add_u32_e32 v20, vcc, v20, v14 -; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v24, v20, 0 -; TONGA-NEXT: v_mul_hi_u32 v13, v24, v13 -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v20, 0 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v14 -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v15, vcc +; TONGA-NEXT: v_add_u32_e32 v22, vcc, v18, v14 +; TONGA-NEXT: v_mul_hi_u32 v23, v21, v13 +; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v21, v22, 0 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v20, v13, 0 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v23, v14 +; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc ; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v18 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v14, v19, vcc -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v21, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v20 +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v20, v22, 0 +; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v19, vcc ; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; TONGA-NEXT: v_add_u32_e32 v15, vcc, v24, v13 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v25, v14, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v19, 31, v17 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v16, v19 -; TONGA-NEXT: v_xor_b32_e32 v20, v13, v19 -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v20, v18, 0 -; TONGA-NEXT: v_mul_hi_u32 v21, v20, v15 -; TONGA-NEXT: v_addc_u32_e32 v17, vcc, v17, v19, vcc -; TONGA-NEXT: v_xor_b32_e32 v22, v17, v19 -; TONGA-NEXT: v_add_u32_e32 v21, vcc, v21, v13 -; TONGA-NEXT: v_addc_u32_e32 v23, vcc, 0, v14, vcc -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v22, v15, 0 -; TONGA-NEXT: v_mad_u64_u32 v[17:18], s[0:1], v22, v18, 0 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v21, v13 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v23, v14, vcc -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v18, vcc -; TONGA-NEXT: v_add_u32_e32 v15, vcc, v13, v17 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v15, v13 +; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; TONGA-NEXT: v_add_u32_e32 v15, vcc, v21, v13 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, v20, v14, vcc +; TONGA-NEXT: v_ashrrev_i32_e32 v20, 31, v17 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v16, v20 +; TONGA-NEXT: v_xor_b32_e32 v21, v13, v20 +; TONGA-NEXT: v_addc_u32_e32 v17, vcc, v17, v20, vcc +; TONGA-NEXT: v_mul_hi_u32 v22, v21, v15 +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v21, v19, 0 +; TONGA-NEXT: v_xor_b32_e32 v23, v17, v20 +; TONGA-NEXT: v_mad_u64_u32 v[17:18], s[0:1], v23, v15, 0 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v22, v13 +; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v17 +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v23, v19, 0 +; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v18, vcc +; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; TONGA-NEXT: v_add_u32_e32 v15, vcc, v15, v13 ; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc ; TONGA-NEXT: v_mul_lo_u32 v17, v11, v13 ; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v11, v15, 0 ; TONGA-NEXT: v_mul_lo_u32 v15, v10, v15 ; TONGA-NEXT: v_add_u32_e32 v14, vcc, v17, v14 ; TONGA-NEXT: v_add_u32_e32 v14, vcc, v15, v14 -; TONGA-NEXT: v_sub_u32_e32 v15, vcc, v22, v14 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v20, v13 +; TONGA-NEXT: v_sub_u32_e32 v15, vcc, v23, v14 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v21, v13 ; TONGA-NEXT: v_subb_u32_e64 v15, s[0:1], v15, v10, vcc ; TONGA-NEXT: v_sub_u32_e64 v17, s[0:1], v13, v11 ; TONGA-NEXT: v_subbrev_u32_e64 v18, s[2:3], 0, v15, s[0:1] ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v18, v10 -; TONGA-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[2:3] +; TONGA-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[2:3] ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v17, v11 ; TONGA-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[2:3] ; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v18, v10 ; TONGA-NEXT: v_subb_u32_e64 v15, s[0:1], v15, v10, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[2:3] +; TONGA-NEXT: v_cndmask_b32_e64 v19, v19, v21, s[2:3] ; TONGA-NEXT: v_sub_u32_e64 v21, s[0:1], v17, v11 ; TONGA-NEXT: v_subbrev_u32_e64 v15, s[0:1], 0, v15, s[0:1] -; TONGA-NEXT: v_subb_u32_e32 v14, vcc, v22, v14, vcc -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v20 +; TONGA-NEXT: v_subb_u32_e32 v14, vcc, v23, v14, vcc +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v19 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v14, v10 ; TONGA-NEXT: v_cndmask_b32_e64 v15, v18, v15, s[0:1] ; TONGA-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc @@ -6400,10 +6400,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; TONGA-NEXT: v_cndmask_b32_e32 v11, v13, v17, vcc ; TONGA-NEXT: v_cndmask_b32_e32 v10, v14, v15, vcc -; TONGA-NEXT: v_xor_b32_e32 v11, v11, v19 -; TONGA-NEXT: v_xor_b32_e32 v13, v10, v19 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v11, v19 -; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v13, v19, vcc +; TONGA-NEXT: v_xor_b32_e32 v11, v11, v20 +; TONGA-NEXT: v_xor_b32_e32 v13, v10, v20 +; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v11, v20 +; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v13, v20, vcc ; TONGA-NEXT: s_cbranch_execnz .LBB12_8 ; TONGA-NEXT: .LBB12_7: ; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v12 @@ -9042,33 +9042,33 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1) ; TONGA-NEXT: s_waitcnt vmcnt(1) ; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v1 ; TONGA-NEXT: v_lshrrev_b32_e32 v12, 30, v12 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v0, v12 -; TONGA-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc -; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 ; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12 +; TONGA-NEXT: v_add_u32_e32 v12, vcc, v0, v12 ; TONGA-NEXT: v_lshrrev_b32_e32 v13, 30, v13 -; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v2, v13 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc -; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 +; TONGA-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v5 -; TONGA-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v12 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v2, v13 ; TONGA-NEXT: v_lshrrev_b32_e32 v14, 30, v14 +; TONGA-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; TONGA-NEXT: v_ashrrev_i32_e32 v15, 31, v7 +; TONGA-NEXT: v_add_u32_e32 v14, vcc, v4, v14 ; TONGA-NEXT: v_lshrrev_b32_e32 v15, 30, v15 -; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc -; TONGA-NEXT: v_add_u32_e64 v12, s[0:1], v4, v14 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v6, v15 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v7, vcc +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v5, vcc +; TONGA-NEXT: v_add_u32_e32 v15, vcc, v6, v15 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v7, vcc ; TONGA-NEXT: v_and_b32_e32 v12, -4, v12 -; TONGA-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v5, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v12 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12 ; TONGA-NEXT: v_and_b32_e32 v13, -4, v13 -; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v14, vcc -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 -; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc +; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v13 +; TONGA-NEXT: v_and_b32_e32 v14, -4, v14 +; TONGA-NEXT: v_subb_u32_e32 v3, vcc, v3, v17, vcc +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v14 +; TONGA-NEXT: v_and_b32_e32 v15, -4, v15 +; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v18, vcc +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v15 +; TONGA-NEXT: v_subb_u32_e32 v7, vcc, v7, v19, vcc ; TONGA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; TONGA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; TONGA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index 571c0f04c06ca..4459c4c4b7ed6 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -266,20 +266,20 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; SI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshr_b64 v[5:6], v[5:6], v2 -; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], v13 -; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], v11 -; SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], v13 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v11 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v8 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: lshr_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 1f93bf7a68972..326129541d9eb 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -841,18 +841,18 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 ; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16 ; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GCN-NEXT: v_sub_u32_e32 v11, vcc, 0, v0 ; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 ; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 ; GCN-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 -; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GCN-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GCN-NEXT: v_sub_u32_e32 v11, vcc, 0, v0 +; GCN-NEXT: v_mul_lo_u32 v11, v11, v10 ; GCN-NEXT: v_sub_u32_e32 v13, vcc, 0, v1 ; GCN-NEXT: v_sub_u32_e32 v15, vcc, 0, v2 ; GCN-NEXT: v_sub_u32_e32 v17, vcc, 0, v3 -; GCN-NEXT: v_mul_lo_u32 v11, v11, v10 ; GCN-NEXT: v_mul_lo_u32 v13, v13, v12 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v14 ; GCN-NEXT: v_mul_lo_u32 v17, v17, v16 @@ -861,54 +861,54 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_mul_hi_u32 v15, v14, v15 ; GCN-NEXT: v_mul_hi_u32 v17, v16, v17 ; GCN-NEXT: v_add_u32_e32 v10, vcc, v10, v11 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_hi_u32 v10, v6, v10 ; GCN-NEXT: v_add_u32_e32 v11, vcc, v12, v13 ; GCN-NEXT: v_add_u32_e32 v12, vcc, v14, v15 ; GCN-NEXT: v_add_u32_e32 v13, vcc, v16, v17 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_hi_u32 v10, v6, v10 ; GCN-NEXT: v_mul_hi_u32 v11, v7, v11 ; GCN-NEXT: v_mul_hi_u32 v12, v8, v12 ; GCN-NEXT: v_mul_hi_u32 v13, v9, v13 ; GCN-NEXT: v_mul_lo_u32 v14, v10, v0 ; GCN-NEXT: v_mul_lo_u32 v16, v11, v1 ; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 -; GCN-NEXT: v_mul_lo_u32 v19, v13, v3 +; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 ; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v14 +; GCN-NEXT: v_mul_lo_u32 v14, v13, v3 ; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v16 ; GCN-NEXT: v_sub_u32_e32 v8, vcc, v8, v18 -; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v19 -; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 +; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v14 ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 -; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12 +; GCN-NEXT: v_add_u32_e32 v19, vcc, 1, v12 ; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v6, v0 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v7, v1 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 -; GCN-NEXT: v_sub_u32_e32 v18, vcc, v6, v0 +; GCN-NEXT: v_sub_u32_e32 v14, vcc, v6, v0 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] ; GCN-NEXT: v_sub_u32_e32 v15, vcc, v7, v1 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] ; GCN-NEXT: v_sub_u32_e32 v17, vcc, v8, v2 -; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] -; GCN-NEXT: v_sub_u32_e32 v14, vcc, v9, v3 +; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v18, s[0:1] -; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v10 +; GCN-NEXT: v_sub_u32_e32 v18, vcc, v9, v3 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[0:1] +; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[2:3] ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11 ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[4:5] -; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] -; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v13 +; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v12 +; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v13 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v16, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v14, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v1 +; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v18, s[6:7] ; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v16, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 94448411cfd0e..a8fdd596d5a88 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -3055,27 +3055,27 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v12, v[18:19] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v4, v20, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v15, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v7, v14, v[19:20] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v11, v[6:7] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v23, v[7:8] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v3, v10, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v8, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v23, v4, v[19:20] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v16, v[21:22] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v8, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v9, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v11, v[19:20] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v1, v8, v[20:21] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[22:23] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v12, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v15, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v12, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v14, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v18, v0, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v9, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v19, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v0, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v24, v2, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3121,27 +3121,27 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL-LABEL: test_vector_reduce_mul_v8i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v5, v12, v[18:19] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v8, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v4, v20, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v15, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v7, v14, v[19:20] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v11, v[6:7] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v23, v[7:8] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v3, v10, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v8, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v23, v4, v[19:20] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v16, v[21:22] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v8, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v9, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v11, v[19:20] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v1, v8, v[20:21] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[22:23] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v12, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v13, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v15, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v5, v12, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v14, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v18, v0, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v9, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v19, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v0, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v24, v2, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3434,268 +3434,268 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v19, v2, v19 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v32, v3, v18 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v18, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v27, v10, v27 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v33, v11, v26 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v26, 0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v3, v19 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v15, v15, v30 -; GFX7-SDAG-NEXT: v_add_i32_e32 v32, vcc, v3, v32 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v14, v30, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v30, v4, v21 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v20, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v5, v5, v20 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v29, v12, v29 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v34, v13, v28 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v21, v4, v21 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v32, v5, v20 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v20, 0 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v20, v12, v29 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v29, v13, v28 ; GFX7-SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v12, v28, 0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v27 -; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v33 -; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v4, v30 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v10, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v2, v11 -; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, v13, v29 -; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, v4, v5 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v17, v0, v17 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v28, v1, v16 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v16, 0 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v25, v8, v25 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v33, v9, v24 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v8, v24, 0 ; GFX7-SDAG-NEXT: v_mul_lo_u32 v23, v6, v23 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v26, v7, v22 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v24, v7, v22 ; GFX7-SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v22, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v17, v0, v17 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v10, v32, v10 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v19, v2, v19 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v22, v3, v18 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v18, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v17 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v18, v15, v30 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v14, v30, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v3, v19 +; GFX7-SDAG-NEXT: v_add_i32_e32 v19, vcc, v1, v28 +; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v5, v21 +; GFX7-SDAG-NEXT: v_add_i32_e32 v9, vcc, v9, v25 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v21, v10, v27 +; GFX7-SDAG-NEXT: v_add_i32_e32 v17, vcc, v5, v32 +; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v9, v33 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v26, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v3, v22 +; GFX7-SDAG-NEXT: v_add_i32_e32 v10, vcc, v10, v21 ; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, v7, v23 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v18, 0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, v7, v26 -; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v21, v2 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v7, v7, v18 -; GFX7-SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v34 +; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, v7, v24 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v7, v7, v15 +; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, v13, v20 +; GFX7-SDAG-NEXT: v_add_i32_e32 v13, vcc, v13, v29 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_mul_lo_u32 v4, v14, v31 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v14, v1, v16 -; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v19, v4 -; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v6, v6, v4 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v16, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v15, v8, v25 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v24, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v8, v9, v24 -; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, v2, v10 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v9, v4, v1 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v0, 0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, v23, v6 -; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v6, v7 -; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, v2, v9 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v7, v3, v11 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v12, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v5, v5, v0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v13, v12 -; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GFX7-SDAG-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v3, v1, v0 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v2, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v5, v8, v22 -; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, v1, v3 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v20, v4 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v20, v22, 0 -; GFX7-SDAG-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v4, v0, v1 -; GFX7-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v3, 0 -; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v2, v3 -; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v14, v31 +; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v16, v1 +; GFX7-SDAG-NEXT: v_add_i32_e32 v14, vcc, v1, v18 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v11, v26 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v11, v6, v14 +; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v10, v2, v1 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v9, 0 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v9, v3, v9 +; GFX7-SDAG-NEXT: v_add_i32_e32 v10, vcc, v2, v10 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v10, v0, v5 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v8, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v3, v7 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v7, v4, v13 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v12, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v10, v17, v12 +; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v8, v19, v8 +; GFX7-SDAG-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v7, v5, v4 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v3, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v8, vcc, v6, v8 +; GFX7-SDAG-NEXT: v_add_i32_e32 v7, vcc, v5, v7 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v2, 0 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v1, v9, v2 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v3, v8, v3 +; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v2, v4, v0 +; GFX7-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX7-SDAG-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GFX7-SDAG-NEXT: v_mul_lo_u32 v3, v3, v5 ; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GFX7-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-GISEL-LABEL: test_vector_reduce_mul_v16i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v0, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v0, v17, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v1, v16, v[33:34] +; GFX7-GISEL-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v18, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v19, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v3, v18, v[16:17] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v4, v21, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v22, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[37:38], s[4:5], v5, v20, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v24, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v10, v26, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v23, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v28, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[38:39], s[4:5], v10, v27, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v14, v30, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[50:51], s[4:5], v7, v22, v[19:20] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v9, v24, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[48:49], s[4:5], v12, v29, v[6:7] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v31, v4, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v8, v35, v[25:26] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[36:37] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v11, v26, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v18, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v19, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v0, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v8, v31, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v18, v[10:11] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v8, v0, v[25:26] -; GFX7-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v29, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v20, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v13, v28, v[10:11] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v21, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[27:28], s[4:5], v8, v18, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v11, v26, v[38:39] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v13, v28, v[48:49] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v0, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v22, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v23, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v3, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v13, v[12:13] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v20, v[25:26] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v22, v[14:15] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v2, v[27:28] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[18:19] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v10, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v7, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v24, v12, v[6:7] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v11, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v34, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v30, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v31, v19, v[7:8] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v20, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v4, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v5, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v2, v17, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v36, v16, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v1, v21, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v37, v5, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v50, v17, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v9, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v15, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v19, v9, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v4, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v3, v[7:8] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v4, v[6:7] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v19, v2, v19 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v32, v3, v18 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v18, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v27, v10, v27 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v33, v11, v26 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v26, 0 -; GFX8-SDAG-NEXT: v_add_u32_e32 v3, vcc, v3, v19 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v15, v15, v30 -; GFX8-SDAG-NEXT: v_add_u32_e32 v32, vcc, v3, v32 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v14, v30, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v30, v4, v21 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v20, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v5, v5, v20 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v29, v12, v29 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v34, v13, v28 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v21, v4, v21 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v32, v5, v20 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v20, 0 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v20, v12, v29 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v29, v13, v28 ; GFX8-SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v12, v28, 0 -; GFX8-SDAG-NEXT: v_add_u32_e32 v11, vcc, v11, v27 -; GFX8-SDAG-NEXT: v_add_u32_e32 v11, vcc, v11, v33 -; GFX8-SDAG-NEXT: v_add_u32_e32 v4, vcc, v4, v30 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v10, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v2, v2, v11 -; GFX8-SDAG-NEXT: v_add_u32_e32 v11, vcc, v13, v29 -; GFX8-SDAG-NEXT: v_add_u32_e32 v13, vcc, v4, v5 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v17, v0, v17 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v28, v1, v16 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v16, 0 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v25, v8, v25 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v33, v9, v24 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v8, v24, 0 ; GFX8-SDAG-NEXT: v_mul_lo_u32 v23, v6, v23 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v26, v7, v22 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v24, v7, v22 ; GFX8-SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v22, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v17, v0, v17 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v10, v32, v10 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v19, v2, v19 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v22, v3, v18 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v18, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v17 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v18, v15, v30 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v14, v30, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v3, vcc, v3, v19 +; GFX8-SDAG-NEXT: v_add_u32_e32 v19, vcc, v1, v28 +; GFX8-SDAG-NEXT: v_add_u32_e32 v5, vcc, v5, v21 +; GFX8-SDAG-NEXT: v_add_u32_e32 v9, vcc, v9, v25 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v21, v10, v27 +; GFX8-SDAG-NEXT: v_add_u32_e32 v17, vcc, v5, v32 +; GFX8-SDAG-NEXT: v_add_u32_e32 v5, vcc, v9, v33 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v26, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v3, vcc, v3, v22 +; GFX8-SDAG-NEXT: v_add_u32_e32 v10, vcc, v10, v21 ; GFX8-SDAG-NEXT: v_add_u32_e32 v7, vcc, v7, v23 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v18, 0 -; GFX8-SDAG-NEXT: v_add_u32_e32 v7, vcc, v7, v26 -; GFX8-SDAG-NEXT: v_add_u32_e32 v2, vcc, v21, v2 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v7, v7, v18 -; GFX8-SDAG-NEXT: v_add_u32_e32 v11, vcc, v11, v34 +; GFX8-SDAG-NEXT: v_add_u32_e32 v7, vcc, v7, v24 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v7, v7, v15 +; GFX8-SDAG-NEXT: v_add_u32_e32 v13, vcc, v13, v20 +; GFX8-SDAG-NEXT: v_add_u32_e32 v13, vcc, v13, v29 ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX8-SDAG-NEXT: v_mul_lo_u32 v4, v14, v31 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v14, v1, v16 -; GFX8-SDAG-NEXT: v_add_u32_e32 v4, vcc, v19, v4 -; GFX8-SDAG-NEXT: v_add_u32_e32 v4, vcc, v4, v15 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v6, v6, v4 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v16, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v15, v8, v25 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v24, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v8, v9, v24 -; GFX8-SDAG-NEXT: v_add_u32_e32 v5, vcc, v5, v17 -; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v15 -; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v8 -; GFX8-SDAG-NEXT: v_add_u32_e32 v8, vcc, v2, v10 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v9, v4, v1 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v0, 0 -; GFX8-SDAG-NEXT: v_add_u32_e32 v6, vcc, v23, v6 -; GFX8-SDAG-NEXT: v_add_u32_e32 v5, vcc, v5, v14 -; GFX8-SDAG-NEXT: v_add_u32_e32 v4, vcc, v6, v7 -; GFX8-SDAG-NEXT: v_add_u32_e32 v6, vcc, v2, v9 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v7, v3, v11 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v12, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v5, v5, v0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v0, v13, v12 -; GFX8-SDAG-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; GFX8-SDAG-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v3, v1, v0 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v2, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v5, v8, v22 -; GFX8-SDAG-NEXT: v_add_u32_e32 v6, vcc, v1, v3 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v1, v20, v4 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v20, v22, 0 -; GFX8-SDAG-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v4, v1 -; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v5 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v4, v0, v1 -; GFX8-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v3, 0 -; GFX8-SDAG-NEXT: v_mul_lo_u32 v2, v2, v3 -; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v1, v14, v31 +; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v16, v1 +; GFX8-SDAG-NEXT: v_add_u32_e32 v14, vcc, v1, v18 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v1, v11, v26 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v11, v6, v14 +; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v10, v1 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v10, v2, v1 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v9, 0 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v9, v3, v9 +; GFX8-SDAG-NEXT: v_add_u32_e32 v10, vcc, v2, v10 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v9, vcc, v10, v9 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v10, v0, v5 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v8, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v3, vcc, v3, v11 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v3, v7 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v7, v4, v13 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v12, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v6, vcc, v6, v10 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v10, v17, v12 +; GFX8-SDAG-NEXT: v_add_u32_e32 v4, vcc, v4, v7 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v8, v19, v8 +; GFX8-SDAG-NEXT: v_add_u32_e32 v4, vcc, v4, v10 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v7, v5, v4 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v3, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v8, vcc, v6, v8 +; GFX8-SDAG-NEXT: v_add_u32_e32 v7, vcc, v5, v7 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v2, 0 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v1, v9, v2 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v3, v8, v3 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v2, v4, v0 +; GFX8-SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX8-SDAG-NEXT: v_add_u32_e32 v3, vcc, v7, v3 +; GFX8-SDAG-NEXT: v_mul_lo_u32 v3, v3, v5 ; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-SDAG-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_mul_v16i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v0, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v0, v17, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v1, v16, v[33:34] +; GFX8-GISEL-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v18, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v19, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v3, v18, v[16:17] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v4, v21, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v22, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[37:38], s[4:5], v5, v20, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v24, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v10, v26, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v23, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v28, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[38:39], s[4:5], v10, v27, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v14, v30, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[50:51], s[4:5], v7, v22, v[19:20] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v9, v24, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[48:49], s[4:5], v12, v29, v[6:7] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v31, v4, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v0, v17, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[4:5], v8, v35, v[25:26] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[36:37] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v11, v26, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v18, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v19, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v0, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v8, v31, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v18, v[10:11] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v8, v0, v[25:26] -; GFX8-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v29, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v20, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v13, v28, v[10:11] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v21, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[27:28], s[4:5], v8, v18, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v11, v26, v[38:39] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v13, v28, v[48:49] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v0, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v15, v30, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v22, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v23, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v3, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v13, v[12:13] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v20, v[25:26] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v22, v[14:15] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v2, v[27:28] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[18:19] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v10, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v7, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v24, v12, v[6:7] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v11, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v6, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v34, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v30, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v31, v19, v[7:8] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v20, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v4, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v5, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v2, v17, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v36, v16, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v1, v21, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v37, v5, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v50, v17, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v9, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v15, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v19, v9, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v4, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v3, v[7:8] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v4, v[6:7] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3902,53 +3902,53 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v16i64: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v14, v30, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v6, v22, 0 +; GFX10-GISEL-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v0, v16, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v2, v18, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v4, v20, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v6, v22, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[50:51], s4, v10, v26, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[70:71], s4, v0, v17, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[82:83], s4, v2, v19, v[34:35] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[48:49], s4, v8, v24, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[83:84], s4, v4, v21, v[36:37] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[52:53], s4, v12, v28, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[64:65], s4, v14, v30, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[66:67], s4, v33, v50, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v1, v16, v[70:71] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v6, v23, v[38:39] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[84:85], s4, v8, v25, v[49:50] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v18, v[82:83] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[85:86], s4, v10, v27, v[51:52] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[86:87], s4, v12, v29, v[53:54] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v22, v[38:39] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[80:81], s4, v37, v64, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v5, v20, v[83:84] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v9, v24, v[84:85] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[68:69], s4, v35, v52, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v11, v26, v[85:86] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v13, v28, v[86:87] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[53:54], s4, v31, v48, 0 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v14, v35, v[32:33] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v6, v23, v[34:35] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v33, v31, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v15, v30, v[36:37] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v0, v16, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v7, v22, v[37:38] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v2, v18, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v4, v20, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v33, v38, v[35:36] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v0, v17, v[15:16] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v2, v19, v[7:8] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v36, v31, v[32:33] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[30:31], s4, v8, v24, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v10, v26, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[15:16], s4, v1, v16, v[37:38] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v3, v18, v[38:39] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v28, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v4, v21, v[23:24] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v8, v25, v[31:32] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v10, v27, v[33:34] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v12, v29, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[18:19], s4, v5, v20, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v9, v24, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v11, v26, v[7:8] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v32, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v13, v28, v[17:18] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v22, v0, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v14, v30, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v6, v9, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v22, v10, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v34, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v14, v19, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v1, v7, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v32, v[11:12] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v18, v0, v[12:13] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v15, v30, v[13:14] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v35, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v1, v11, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v8, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v34, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v12, v7, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v2, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v8, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v14, v55, v[65:66] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v15, v30, v[2:3] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v53, v68, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v37, v4, v[81:82] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v33, v8, v[67:68] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v35, v9, v[69:70] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v66, v80, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v64, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v31, v7, v[54:55] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v1, v50, v[11:12] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v16, v52, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v66, v9, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v0, v48, v[6:7] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v53, v7, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v10, v80, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v4, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v68, v[6:7] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v7, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v4, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v16i64: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll index 8d0e00383d692..f60189e0305a5 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll @@ -3927,38 +3927,38 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[0:1], v[16:17] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[8:9], v[4:5], v[20:21] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[12:13], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[14:15], v[2:3], v[18:19] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[8:9] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[14:15] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[14:15] ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[10:11], v[6:7], v[22:23] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[10:11] ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[14:15], v[30:31] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc @@ -3979,29 +3979,17 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[6:7], v[4:5], v[20:21] +; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[10:11], v[8:9], v[24:25] +; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[14:15], v[12:13], v[28:29] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[14:15] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] -; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[10:11] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[14:15] ; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[8:9] ; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[6:7], v[4:5], v[12:13] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -4009,8 +3997,20 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[18:19] +; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[8:9], v[6:7], v[22:23] +; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[12:13], v[10:11], v[26:27] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[8:9] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[8:9] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX7-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[14:15], v[30:31] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc @@ -4031,38 +4031,38 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[0:1], v[16:17] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[8:9], v[4:5], v[20:21] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[12:13], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[14:15], v[2:3], v[18:19] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[8:9] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[14:15] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[14:15] ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[10:11], v[6:7], v[22:23] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[10:11] ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[14:15], v[30:31] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc @@ -4083,29 +4083,17 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[6:7], v[4:5], v[20:21] +; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[10:11], v[8:9], v[24:25] +; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[14:15], v[12:13], v[28:29] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[14:15] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] -; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[10:11] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[14:15] ; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[8:9] ; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[6:7], v[4:5], v[12:13] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -4113,8 +4101,20 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[18:19] +; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[8:9], v[6:7], v[22:23] +; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[12:13], v[10:11], v[26:27] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX8-GISEL-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[14:15], v[30:31] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll index f15ecf014ab0b..2c91bdc08b271 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll @@ -3926,38 +3926,38 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[16:17] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[8:9], v[4:5], v[20:21] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[12:13], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[14:15], v[2:3], v[18:19] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[8:9] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[14:15] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[14:15] ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[10:11], v[6:7], v[22:23] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[10:11] ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[14:15], v[30:31] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc @@ -3978,29 +3978,17 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[6:7], v[4:5], v[20:21] +; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[10:11], v[8:9], v[24:25] +; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[14:15], v[12:13], v[28:29] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[14:15] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] -; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[10:11] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[14:15] ; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[8:9] ; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[6:7], v[4:5], v[12:13] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -4008,8 +3996,20 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[18:19] +; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[8:9], v[6:7], v[22:23] +; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[12:13], v[10:11], v[26:27] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[8:9] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[8:9] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX7-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[14:15], v[30:31] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc @@ -4030,38 +4030,38 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[16:17] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[8:9], v[4:5], v[20:21] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[12:13], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[14:15], v[2:3], v[18:19] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[8:9] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[14:15] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[14:15] ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[10:11], v[6:7], v[22:23] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[10:11] ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[14:15], v[30:31] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc @@ -4082,29 +4082,17 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[6:7], v[4:5], v[20:21] +; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[10:11], v[8:9], v[24:25] +; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[14:15], v[12:13], v[28:29] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[14:15] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] -; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[10:11] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[14:15] ; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[8:9] ; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[6:7], v[4:5], v[12:13] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -4112,8 +4100,20 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[18:19] +; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[8:9], v[6:7], v[22:23] +; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[12:13], v[10:11], v[26:27] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX8-GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[14:15], v[30:31] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll index e62165cb933c5..e310aa2b9f603 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll @@ -3803,38 +3803,38 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[16:17] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[8:9], v[4:5], v[20:21] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[14:15], v[2:3], v[18:19] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[8:9] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[14:15] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[14:15] ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[10:11], v[6:7], v[22:23] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[10:11] ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[30:31] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc @@ -3855,29 +3855,17 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[6:7], v[4:5], v[20:21] +; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[10:11], v[8:9], v[24:25] +; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[14:15], v[12:13], v[28:29] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[14:15] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] -; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[10:11] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc -; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[14:15] ; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[6:7], v[4:5], v[12:13] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -3885,8 +3873,20 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[18:19] +; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[8:9], v[6:7], v[22:23] +; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[26:27] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[8:9] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[8:9] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX7-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[30:31] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc @@ -3907,38 +3907,38 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[16:17] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[8:9], v[4:5], v[20:21] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[14:15], v[2:3], v[18:19] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[8:9] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[14:15] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[14:15] ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[10:11], v[6:7], v[22:23] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[10:11] ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[30:31] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc @@ -3959,29 +3959,17 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[6:7], v[4:5], v[20:21] +; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[10:11], v[8:9], v[24:25] +; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[14:15], v[12:13], v[28:29] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[14:15] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] -; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[10:11] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc -; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[14:15] ; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[6:7], v[4:5], v[12:13] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -3989,8 +3977,20 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[18:19] +; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[8:9], v[6:7], v[22:23] +; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[26:27] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX8-GISEL-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[30:31] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll index 83ecaaa7e0846..10c7f4baccb44 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -3539,38 +3539,38 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[0:1], v[16:17] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[8:9], v[4:5], v[20:21] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[12:13], v[10:11], v[26:27] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[14:15], v[2:3], v[18:19] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[8:9] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[14:15] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[14:15] ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[10:11], v[6:7], v[22:23] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[10:11] ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[30:31] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc @@ -3591,29 +3591,17 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[6:7], v[4:5], v[20:21] +; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[10:11], v[8:9], v[24:25] +; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[14:15], v[12:13], v[28:29] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[14:15] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] -; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[2:3], v[10:11] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc -; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[14:15] ; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] ; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[6:7], v[4:5], v[12:13] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -3621,8 +3609,20 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[2:3], v[18:19] +; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[8:9], v[6:7], v[22:23] +; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[12:13], v[10:11], v[26:27] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[8:9] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[8:9] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX7-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[2:3], v[10:11] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[30:31] ; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc @@ -3643,38 +3643,38 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[0:1], v[16:17] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[8:9], v[4:5], v[20:21] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[12:13], v[10:11], v[26:27] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[14:15], v[2:3], v[18:19] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[8:9] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[14:15] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[14:15] ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[10:11], v[6:7], v[22:23] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[10:11] ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[30:31] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc @@ -3695,29 +3695,17 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] +; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[6:7], v[4:5], v[20:21] +; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[10:11], v[8:9], v[24:25] +; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[14:15], v[12:13], v[28:29] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[14:15] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] -; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[2:3], v[10:11] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc -; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[6:7] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[14:15] ; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] ; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[6:7], v[4:5], v[12:13] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc @@ -3725,8 +3713,20 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] ; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[2:3], v[18:19] +; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[8:9], v[6:7], v[22:23] +; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[12:13], v[10:11], v[26:27] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[12:13] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[8:9] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[12:13] +; GFX8-GISEL-NEXT: v_cmp_lt_u64_e64 s[4:5], v[2:3], v[10:11] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] +; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[30:31] ; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 5e18b469a4e88..b75cbe53f60aa 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -257,26 +257,27 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[0:1] offset:240 -; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:224 -; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[0:1] offset:208 -; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[0:1] offset:192 -; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[0:1] offset:176 -; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[0:1] offset:160 -; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:144 -; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] offset:128 -; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:112 -; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[0:1] offset:96 -; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[0:1] offset:80 -; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[0:1] offset:64 -; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[0:1] offset:48 -; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:32 -; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:16 -; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[0:1] +; GFX942-NEXT: v_lshlrev_b32_e32 v50, 3, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx4 v[30:33], v50, s[0:1] offset:240 +; GFX942-NEXT: global_load_dwordx4 v[26:29], v50, s[0:1] offset:224 +; GFX942-NEXT: global_load_dwordx4 v[22:25], v50, s[0:1] offset:208 +; GFX942-NEXT: global_load_dwordx4 v[18:21], v50, s[0:1] offset:192 +; GFX942-NEXT: global_load_dwordx4 v[14:17], v50, s[0:1] offset:176 +; GFX942-NEXT: global_load_dwordx4 v[10:13], v50, s[0:1] offset:160 +; GFX942-NEXT: global_load_dwordx4 v[6:9], v50, s[0:1] offset:144 +; GFX942-NEXT: global_load_dwordx4 v[2:5], v50, s[0:1] offset:128 +; GFX942-NEXT: global_load_dwordx4 v[46:49], v50, s[0:1] offset:112 +; GFX942-NEXT: global_load_dwordx4 v[42:45], v50, s[0:1] offset:96 +; GFX942-NEXT: global_load_dwordx4 v[38:41], v50, s[0:1] offset:80 +; GFX942-NEXT: global_load_dwordx4 v[34:37], v50, s[0:1] offset:64 +; GFX942-NEXT: global_load_dwordx4 a[0:3], v50, s[0:1] offset:48 +; GFX942-NEXT: global_load_dwordx4 v[58:61], v50, s[0:1] offset:32 +; GFX942-NEXT: global_load_dwordx4 v[54:57], v50, s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_dwordx4 v[50:53], v50, s[0:1] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB6_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 @@ -289,32 +290,32 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-NEXT: global_load_dwordx4 v[10:13], v0, s[2:3] offset:160 ; GFX942-NEXT: global_load_dwordx4 v[6:9], v0, s[2:3] offset:144 ; GFX942-NEXT: global_load_dwordx4 v[2:5], v0, s[2:3] offset:128 -; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[2:3] offset:112 -; GFX942-NEXT: global_load_dwordx4 v[58:61], v0, s[2:3] offset:96 -; GFX942-NEXT: global_load_dwordx4 v[54:57], v0, s[2:3] offset:80 -; GFX942-NEXT: global_load_dwordx4 v[50:53], v0, s[2:3] offset:64 -; GFX942-NEXT: global_load_dwordx4 v[46:49], v0, s[2:3] offset:48 -; GFX942-NEXT: global_load_dwordx4 v[42:45], v0, s[2:3] offset:32 -; GFX942-NEXT: global_load_dwordx4 v[38:41], v0, s[2:3] offset:16 -; GFX942-NEXT: global_load_dwordx4 v[34:37], v0, s[2:3] +; GFX942-NEXT: global_load_dwordx4 v[46:49], v0, s[2:3] offset:112 +; GFX942-NEXT: global_load_dwordx4 v[42:45], v0, s[2:3] offset:96 +; GFX942-NEXT: global_load_dwordx4 v[38:41], v0, s[2:3] offset:80 +; GFX942-NEXT: global_load_dwordx4 v[34:37], v0, s[2:3] offset:64 +; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[2:3] offset:48 +; GFX942-NEXT: global_load_dwordx4 v[58:61], v0, s[2:3] offset:32 +; GFX942-NEXT: global_load_dwordx4 v[54:57], v0, s[2:3] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[50:53], v0, s[2:3] ; GFX942-NEXT: .LBB6_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] offset:112 +; GFX942-NEXT: global_store_dwordx4 v1, v[46:49], s[6:7] offset:112 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v1, v[58:61], s[6:7] offset:96 +; GFX942-NEXT: global_store_dwordx4 v1, v[42:45], s[6:7] offset:96 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v1, v[54:57], s[6:7] offset:80 +; GFX942-NEXT: global_store_dwordx4 v1, v[38:41], s[6:7] offset:80 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v1, v[50:53], s[6:7] offset:64 +; GFX942-NEXT: global_store_dwordx4 v1, v[34:37], s[6:7] offset:64 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v1, v[46:49], s[6:7] offset:48 +; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] offset:48 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v1, v[42:45], s[6:7] offset:32 +; GFX942-NEXT: global_store_dwordx4 v1, v[58:61], s[6:7] offset:32 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v1, v[38:41], s[6:7] offset:16 +; GFX942-NEXT: global_store_dwordx4 v1, v[54:57], s[6:7] offset:16 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v1, v[34:37], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v1, v[50:53], s[6:7] ; GFX942-NEXT: global_store_dwordx4 v1, v[30:33], s[6:7] offset:240 ; GFX942-NEXT: global_store_dwordx4 v1, v[26:29], s[6:7] offset:224 ; GFX942-NEXT: global_store_dwordx4 v1, v[22:25], s[6:7] offset:208